• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "ExecutionPlan"
18 
19 #include "ExecutionPlan.h"
20 
21 #include <android/sync.h>
22 #include <fcntl.h>
23 #include <openssl/sha.h>
24 #include <sys/stat.h>
25 #include <sys/types.h>
26 
27 #include <algorithm>
28 #include <functional>
29 #include <map>
30 #include <memory>
31 #include <mutex>
32 #include <queue>
33 #include <set>
34 #include <string>
35 #include <type_traits>
36 #include <unordered_set>
37 #include <utility>
38 #include <vector>
39 
40 #include "BurstBuilder.h"
41 #include "Callbacks.h"
42 #include "CompilationBuilder.h"
43 #include "ControlFlow.h"
44 #include "CpuExecutor.h"
45 #include "ExecutionBuilder.h"
46 #include "ExecutionBurstController.h"
47 #include "GraphDump.h"
48 #include "Manager.h"
49 #include "MetaModel.h"
50 #include "ModelBuilder.h"
51 #include "OperationsUtils.h"
52 #include "TokenHasher.h"
53 #include "Tracing.h"
54 #include "TypeManager.h"
55 #include "Utils.h"
56 
57 namespace android {
58 namespace nn {
59 
60 namespace {
61 
62 using namespace hal;
63 
64 // The index of the main model in SourceModels.
65 constexpr uint32_t kMainModelInSourceModels = 0;
66 
67 // Compiles the model on device.
68 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
69 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
70 // operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
71 // device name, device version string, and the execution preference in this function.
compile(const Device & device,const ModelBuilder & model,int executionPreference,int compilationPriority,const std::optional<Deadline> & deadline,const std::string & cacheDir,TokenHasher * token,std::shared_ptr<PreparedModel> * preparedModel)72 int compile(const Device& device, const ModelBuilder& model, int executionPreference,
73             int compilationPriority, const std::optional<Deadline>& deadline,
74             const std::string& cacheDir, TokenHasher* token,
75             std::shared_ptr<PreparedModel>* preparedModel) {
76     CHECK(token != nullptr);
77     CHECK(preparedModel != nullptr);
78     *preparedModel = nullptr;
79 
80     std::optional<CacheToken> cacheToken;
81     if (device.isCachingSupported() && token->ok() &&
82         token->updateFromString(device.getName().c_str()) &&
83         token->updateFromString(device.getVersionString().c_str()) &&
84         token->update(&executionPreference, sizeof(executionPreference)) &&
85         token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
86         cacheToken.emplace(token->getCacheToken());
87     }
88 
89     const ModelFactory makeModel = [&model] { return model.makeHidlModel(); };
90     const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
91     const Priority priority = convertToHalPriority(compilationPriority);
92     const auto [n, returnedPreparedModel] =
93             device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken);
94     *preparedModel = returnedPreparedModel;
95     return n;
96 }
97 
98 typedef std::function<void(uint32_t)> OperationReadyCallback;
99 
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)100 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
101                            const Operand& fromOperand) {
102     if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
103         fromOperand.extraParams.getDiscriminator() ==
104                 OperandExtraParams::hidl_discriminator::channelQuant) {
105         auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
106         ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
107                 .channelDim = fromChannelQuant.channelDim,
108                 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
109                 .scales = fromChannelQuant.scales.data(),
110         };
111         return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
112     } else if (isExtensionOperandType(fromOperand.type) &&
113                fromOperand.extraParams.getDiscriminator() ==
114                        OperandExtraParams::hidl_discriminator::extension) {
115         hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
116         return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
117                                              extensionData.size());
118     } else if (fromOperand.extraParams.getDiscriminator() !=
119                        OperandExtraParams::hidl_discriminator::none ||
120                fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
121         LOG(ERROR) << "Type " << toString(fromOperand.type)
122                    << " has an unexpected extraParams discriminator: "
123                    << static_cast<int>(fromOperand.extraParams.getDiscriminator());
124         return ANEURALNETWORKS_BAD_DATA;
125     } else {
126         return ANEURALNETWORKS_NO_ERROR;
127     }
128 }
129 
// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figure out which operations can be
    // executed right away and cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run.  Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps an operand whose value is not yet known to the indices of the
    // operations that consume it as an input.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
147 
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)148 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
149     : mModel(model) {
150     const auto& operations = mModel->getOperations();
151     mUnknownInputCount.resize(operations.size());
152     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
153         const Operation& operation = operations[operationIndex];
154         uint32_t count = 0;
155         for (uint32_t operandIndex : operation.inputs) {
156             auto lifetime = mModel->getOperand(operandIndex).lifetime;
157             if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
158                 lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
159                 count++;
160                 mOperandToOperations.emplace(operandIndex, operationIndex);
161             }
162         }
163         if (count == 0) {
164             cb(operationIndex);
165         }
166         mUnknownInputCount[operationIndex] = count;
167     }
168 }
169 
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)170 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
171     // Mark all its outputs as known.
172     const Operation& operation = mModel->getOperations()[operationIndex];
173     for (uint32_t operandIndex : operation.outputs) {
174         auto range = mOperandToOperations.equal_range(operandIndex);
175         for (auto i = range.first; i != range.second; i++) {
176             uint32_t& count = mUnknownInputCount[i->second];
177             if (--count == 0) {
178                 cb(i->second);
179             }
180         }
181     }
182 }
183 
184 }  // namespace
185 
ExecutionStep(ExecutionPlan * plan,uint32_t stepIndex,uint32_t sourceModelIndex,std::shared_ptr<Device> device)186 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
187                              std::shared_ptr<Device> device)
188     : mPlan(plan),
189       mIndex(stepIndex),
190       mSourceModelIndex(sourceModelIndex),
191       mStepModel(),
192       mDevice(device),
193       mToken(plan->getCacheToken()) {}
194 
// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
//
// `kind` says how this step first encounters the operand: as an INPUT or an
// OUTPUT of one of its operations. An operand already in mOperandMap may only
// be re-encountered as an INPUT (operations are added in topological order, so
// each operand is defined exactly once before any use within the step).
// Returns an ANEURALNETWORKS_* status code.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Per-channel quantization and extension data are not part of
    // ANeuralNetworksOperandType, so they are copied separately.
    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case OperandLifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH: {
            // A reference to another model (used by control flow operations).
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        default: {
            CHECK(!"unexpected");
        } break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
304 
addOperation(int operationIndex)305 int ExecutionStep::addOperation(int operationIndex) {
306     const Operation& operation = getSourceModel()->getOperation(operationIndex);
307     if (mToken.ok()) {
308         mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
309         mToken.update(&operationIndex, sizeof(operationIndex));
310     }
311 
312     // Convert the input and output operand indexes.
313     //
314     // We expect operations to be added in topological order.  Therefore:
315     //
316     // - We may not have seen an input if it is a model input, a
317     //   constant, or an operand written by a different partition.
318     //
319     // - We should not have seen any outputs.
320     auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands,
321                               std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
322         const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
323         for (uint32_t i = 0; i < operandCount; i++) {
324             NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
325         }
326         return ANEURALNETWORKS_NO_ERROR;
327     };
328 
329     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
330     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
331     std::vector<uint32_t> inputs(inputCount);
332     std::vector<uint32_t> outputs(outputCount);
333     NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
334     NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
335     return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
336                                    outputCount, outputs.data());
337 }
338 
// Binds this step's model inputs and outputs to the given executor. Each
// step-boundary operand is resolved, in priority order, to one of:
//  - an offset into the shared temporary memory,
//  - a main-model input or output index, or
//  - (inputs only) a constant partition-boundary reference.
// It is a fatal error (CHECK failure) if an operand resolves to none of these.
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            // Inter-partition temporary: read it from the shared temporary memory.
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            // A main-model output produced by an earlier step and consumed here.
            executor->mapOutputToInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            // Inter-partition temporary: write it to the shared temporary memory.
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // mStepModelInputs/mStepModelOutputs pair (source operand index, step operand
    // index); position i is step input/output i.
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
387 
// For every temporary operand that some step (or IF/WHILE step) consumes as an
// input, tells the ExecutionStep that defines that temporary to also expose it
// as a step model output, so its value is available across the partition
// boundary.
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
            return;
        }
        uint32_t stepIndex = it->second;
        CHECK_LT(stepIndex, mSteps.size());
        mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
    };
    // Walk every logical step and record each temporary it consumes.
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& input : step->getTempsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // An IF consumes its condition and its outer input operands.
            recordAsOutputIfTemporary(step->conditionOperandIndex);
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else {
            // GOTO steps consume no operands.
            CHECK(logicalStep->isGoto());
        }
    }
}
421 
recordTempAsStepModelOutput(uint32_t stepOperandIndex)422 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
423     const auto it = mOperandMap.find(stepOperandIndex);
424     CHECK(it != mOperandMap.end());
425     mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
426 }
427 
getSourceModel() const428 const ModelBuilder* ExecutionStep::getSourceModel() const {
429     return mPlan->getSourceModels().getModel(mSourceModelIndex);
430 }
431 
logStepModel() const432 void ExecutionStep::logStepModel() const {
433     VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
434 
435     auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
436         if (!toLog.empty()) {
437             toLog += ", ";
438         }
439         toLog += toString(e.first);
440         toLog += "->";
441         toLog += toString(e.second);
442     };
443 
444     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
445         std::string toLog;
446         for (const auto& e : map) {
447             logRemapEntry(toLog, e);
448         }
449         VLOG(COMPILATION) << name << ": " << toLog;
450     };
451     auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
452         std::string toLog;
453         for (const auto& e : set) {
454             logRemapEntry(toLog, e);
455         }
456         VLOG(COMPILATION) << name << ": " << toLog;
457     };
458 
459     logRemapVector("step model inputs", mStepModelInputs);
460     logRemapVector("step model outputs", mStepModelOutputs);
461     logRemapVector("model inputs", mModelInputs);
462     logRemapVector("model outputs", mModelOutputs);
463     logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
464     logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
465     logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
466 }
467 
hasUnknownSize(const Operand & operand)468 static bool hasUnknownSize(const Operand& operand) {
469     if (operand.dimensions.size() == 0) {
470         return TypeManager::get()->isTensorType(operand.type);
471     }
472     for (uint32_t dimension : operand.dimensions) {
473         if (dimension == 0) {
474             return true;
475         }
476     }
477     return false;
478 }
479 
// Finalizes this step: assembles the step model's complete input/output lists,
// records index mappings back to the main model (for steps of the main model),
// finishes the step model, and compiles it on mDevice.
// Sets *hasOutputOfUnknownSize if any temporary exposed as a step model output
// has a statically unknown size. Returns an ANEURALNETWORKS_* status code.
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first)
                              << " of source graph) has unknown size: " << toString(operand);
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // The final step model input order is: model inputs, then temps consumed
    // from other steps, then main-model outputs consumed as inputs. The index
    // mappings computed below rely on this ordering.
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    // Similarly, step model outputs are: model outputs, then temps exposed to
    // other steps.
    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    if (mSourceModelIndex == kMainModelInSourceModels) {
        // Build reverse maps from main-model operand index to main-model
        // input/output position.
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Extract the step-model operand indices (the .second of each pair) in order.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    // TODO: Model::finish() should use ValidationMode::RUNTIME when sending the
    // step model to CpuDevice. Right now, this is harmless because the only
    // difference in validation occurs with control flow operations and inputs
    // or outputs of unknown size and we never send control flow operations to
    // CpuDevice. We need to address this if this behavior changes (b/151634976).
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(),
                   &mToken, &mPreparedStepModel);
}
570 
dump() const571 void ExecutionStep::dump() const {
572     if (VLOG_IS_ON(COMPILATION)) {
573         VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
574         logModelToInfo(mStepModel.makeHidlModel());
575     }
576 }
577 
toString(const IfStep & step)578 std::string toString(const IfStep& step) {
579     std::ostringstream oss;
580     oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
581         << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
582     return oss.str();
583 }
584 
toString(const WhileStep & step)585 std::string toString(const WhileStep& step) {
586     std::ostringstream oss;
587     oss << "Step#" << step.index << ": while cond=" << step.condStepIndex
588         << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
589     return oss.str();
590 }
591 
toString(const GotoStep & step)592 std::string toString(const GotoStep& step) {
593     std::ostringstream oss;
594     oss << "Step#" << step.index << ": goto " << step.gotoStepIndex;
595     return oss.str();
596 }
597 
dump() const598 void LogicalStep::dump() const {
599     if (VLOG_IS_ON(COMPILATION)) {
600         if (const IfStep* step = tryIfStep()) {
601             VLOG(COMPILATION) << toString(*step);
602         } else if (const WhileStep* step = tryWhileStep()) {
603             VLOG(COMPILATION) << toString(*step);
604         } else if (const GotoStep* step = tryGotoStep()) {
605             VLOG(COMPILATION) << toString(*step);
606         } else {
607             executionStep()->dump();
608         }
609     }
610 }
611 
// Finalizes a partitioned (multi-step) plan: compiles each ExecutionStep's
// step model on its assigned device, validates that control flow boundary
// operands have fully-specified shapes, builds the main-model input/output
// index maps, and records boundary constants.
//
// Returns ANEURALNETWORKS_NO_ERROR on success, or the first step's failure
// code / ANEURALNETWORKS_OP_FAILED if a step model output has unknown size.
// NOTE(review): deadline is CHECKed to be absent here — compilation deadlines
// are presumably unsupported for compound bodies; confirm against callers.
int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                        int32_t executionPreference, int32_t priority,
                                        const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    CHECK(!deadline.has_value());
    const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);

    // Returns true if any of the given source operands has a dimension or rank
    // that is not fully specified.
    auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
        for (const auto& sourceOperandIndex : operands) {
            const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
            const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
            if (hasUnknownSize(operand)) {
                return true;
            }
        }
        return false;
    };

    findTempsAsStepModelOutputs();
    for (const auto& logicalStep : mSteps) {
        if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Compile this partition's step model on its assigned device.
            int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
                                          executionPreference, priority);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                VLOG(COMPILATION)
                        << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
                return n;
            }
        } else if (IfStep* step = logicalStep->tryIfStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            // step->conditionOperandIndex has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->thenBranchInputOperands));
            CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
            CHECK(!containsUnknownSize(step->elseBranchInputOperands));
            CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
        } else if (WhileStep* step = logicalStep->tryWhileStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            CHECK(!containsUnknownSize(step->condInputOperands));
            // step->condOutputOperand has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->bodyInputOperands));
            CHECK(!containsUnknownSize(step->bodyOutputOperands));
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    if (mHasStepModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Map each main-model input/output source operand to its position in the
    // client-visible input/output lists, for use at execution time.
    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
        mSourceOperandToInputIndex[index] = i;
    }
    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
        mSourceOperandToOutputIndex[index] = i;
    }

    // Record constants that cross control flow boundaries so the Controller
    // can stage them at execution time.
    findControlFlowBoundaryConstants(sourceModels);

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}
681 
// Scans all IF/WHILE steps and records every constant operand that appears on
// a control flow boundary (the IF condition and the outer input operands).
// CONSTANT_COPY operands are recorded with a pointer/length into the source
// model's value store; CONSTANT_REFERENCE operands with their memory pool,
// offset, and length. Operands with other lifetimes are left alone. The
// resulting maps are consumed by makeController() / getBuffer().
void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            // The IF condition operand is itself a boundary operand.
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}
715 
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const std::optional<Deadline> & deadline)716 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
717                                       int32_t priority, const std::optional<Deadline>& deadline) {
718     CHECK(!mSuccessfulFinish);
719     CHECK(mDevice != nullptr);
720     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
721     const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
722                           &mToken, &mPreparedModel);
723     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
724     return n;
725 }
726 
finish(int32_t executionPreference,int32_t priority,const std::optional<Deadline> & deadline)727 int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
728                           const std::optional<Deadline>& deadline) {
729     CHECK(mBody != nullptr);
730     return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
731 }
732 
// Convenience constructor used for SIMPLE plans: no shared temporary memory
// and no boundary-operand maps are needed, so everything is left empty.
ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                                      const BurstBuilder* burstBuilder)
    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}
736 
// Full constructor used for COMPOUND plans. Allocates a single ashmem region
// of totalSizeOfTemporaries bytes to back all partition-boundary temporaries,
// then stages boundary CONSTANT_COPY values into their assigned offsets.
//
// Note that sourceOperandToConstantCopy is not stored as a member: its values
// are consumed here by copying them into the temporary memory.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
      mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // BUG FIX: return immediately -- mTemporaries is null here, and the
        // copy loop below would dereference it.
        return;
    }
    // Stage each boundary constant into its pre-assigned offset within the
    // shared temporary memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex],
               location.buffer, location.length);
    }
}
771 
772 // Attempt to create a burst object for each PreparedModel/Partition. If the
773 // burst controller object cannot be made, return a nullptr in its place to
774 // indicate the regular execution path should be used. This can occur either
775 // because PreparedModel was nullptr (cpu was best choice), or because the
776 // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts(int preference) const777 std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
778         int preference) const {
779     switch (mState) {
780         // burst object for each partition in the compound case
781         case COMPOUND: {
782             std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
783             bursts.reserve(compound()->mSteps.size());
784             for (const auto& logicalStep : compound()->mSteps) {
785                 if (!logicalStep->isExecution()) {
786                     bursts.push_back(nullptr);
787                     continue;
788                 }
789                 if (const auto preparedModel =
790                             logicalStep->executionStep()->getPreparedStepModel()) {
791                     const bool preferPowerOverLatency =
792                             (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
793                     bursts.push_back(
794                             preparedModel->configureExecutionBurst(preferPowerOverLatency));
795                 } else {
796                     bursts.push_back(nullptr);
797                 }
798             }
799             return bursts;
800         }
801         // single burst object for the simple case
802         case SIMPLE: {
803             std::vector<std::shared_ptr<ExecutionBurstController>> burst;
804             auto simpleBody = simple();
805             if (const auto preparedModel = simpleBody->mPreparedModel) {
806                 const bool preferPowerOverLatency =
807                         (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
808                 burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
809             } else {
810                 burst.push_back(nullptr);
811             }
812             return burst;
813         }
814         // no burst objects made
815         default:
816             return {};
817     }
818 }
819 
// Creates a per-execution Controller for this plan. For a SIMPLE plan the
// Controller needs no temporary memory. For a COMPOUND plan this computes the
// layout (offset per source operand) of a single shared memory region holding
// all partition-boundary temporaries and control-flow buffers, then hands the
// layout and the boundary-operand maps to the Controller constructor.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    if (mState == SIMPLE) {
        return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
    }
    // Create the layout for a Memory object big enough to hold
    // - every partition boundary TEMPORARY operand and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    // Reserves an aligned region of `size` bytes at the current end of the
    // shared memory layout and returns the region's offset.
    auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
        totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
        const uint32_t offset = totalSizeOfTemporaries;
        totalSizeOfTemporaries += size;
        return offset;
    };
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
    //    operands, and panic if we see a source operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary =
            [executionBuilder, addTemporaryOfSize](
                    const SourceOperandIndex& sourceOperandIndex,
                    std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
                    OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
                CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                      lifetime == OperandLifeTime::SUBGRAPH_OUTPUT);
                const Operand& sourceOperand =
                        executionBuilder->getSourceOperand(sourceOperandIndex);
                if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE &&
                    sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                    // See the caller for explanation.
                    return;
                }
                CHECK(sourceOperand.lifetime == lifetime);
                // Unknown-size operands were rejected in CompoundBody::finish,
                // so the size must be nonzero here.
                const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
                CHECK_NE(size, 0u);
                const uint32_t offset = addTemporaryOfSize(size);
                auto [_, isNew] =
                        sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
                CHECK(isNew);
                VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                                << " offset = " << offset;
            };
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
    for (const auto& logicalStep : compound()->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToOffsetOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary,
                         OperandLifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands. The
    // Controller constructor will copy the constant values into these regions.
    for (const auto& [sourceOperandIndex, location] :
         compound()->mSourceOperandToBoundaryConstantCopy) {
        const uint32_t offset = addTemporaryOfSize(location.length);
        sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << offset;
    }
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToOffsetOfTemporary),
            std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
            compound()->mSourceOperandToOutputIndex,
            compound()->mSourceOperandToBoundaryConstantCopy,
            compound()->mSourceOperandToBoundaryConstantReference));
}
980 
981 // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor) const982 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
983                             std::shared_ptr<StepExecutor>* executor) const {
984     *executor = nullptr;
985 
986     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
987                     << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
988 
989     if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
990         // We haven't called next().
991         return ANEURALNETWORKS_OP_FAILED;
992     }
993 
994     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
995         // The last call to next() did not produce an executor.
996         return ANEURALNETWORKS_OP_FAILED;
997     }
998 
999     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1000     return next(controller, executor);
1001 }
1002 
// Wraps an existing client-provided buffer of `size` bytes; the view starts
// at the beginning of the buffer (offset 0).
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(reinterpret_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}
1006 
// Wraps a view into an existing memory pool, starting at `offset` bytes.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1009 
getPointer() const1010 void* ExecutionPlan::Buffer::getPointer() const {
1011     return mInfo.getBuffer() + mOffset;
1012 }
1013 
getSize() const1014 uint32_t ExecutionPlan::Buffer::getSize() const {
1015     return mInfo.getSize() - mOffset;
1016 }
1017 
// Flushes the underlying pool; delegates to RunTimePoolInfo::flush().
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1021 
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1022 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1023         const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1024     switch (info.state()) {
1025         case ModelArgumentInfo::POINTER: {
1026             return Buffer(info.buffer(), info.length());
1027         } break;
1028         case ModelArgumentInfo::MEMORY: {
1029             if (std::optional<RunTimePoolInfo> poolInfo =
1030                         executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1031                 return Buffer(*poolInfo, info.locationAndLength().offset);
1032             } else {
1033                 LOG(ERROR) << "Unable to map operand memory pool";
1034                 return std::nullopt;
1035             }
1036         } break;
1037         case ModelArgumentInfo::HAS_NO_VALUE: {
1038             LOG(ERROR) << "Attempting to read an operand that has no value";
1039             return std::nullopt;
1040         } break;
1041         default: {
1042             LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1043             return std::nullopt;
1044         } break;
1045     }
1046 }
1047 
// Resolves a source operand to a Buffer by checking, in order:
// 1. the shared temporary memory (partition-boundary temporaries),
// 2. the main model's inputs,
// 3. the main model's outputs,
// 4. boundary CONSTANT_REFERENCE operands.
// Returns std::nullopt if the operand is not found in any of these, or if its
// constant-reference pool cannot be mapped.
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex);
        it != sourceOperandToOffsetOfTemporary.end()) {
        // The buffer extends from the operand's offset to the end of the
        // shared temporary memory.
        const uint32_t offset = it->second;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        // Boundary constant with CONSTANT_REFERENCE lifetime: map its pool.
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}
1078 
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1079 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1080                                       SourceOperandIndex operandIndex, bool* value) const {
1081     std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1082     if (buffer == std::nullopt) {
1083         LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1084         return ANEURALNETWORKS_OP_FAILED;
1085     }
1086     CHECK_GE(buffer->getSize(), sizeof(bool8));
1087     bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1088     *value = static_cast<bool>(value8);
1089     VLOG(EXECUTION) << "readConditionValue: " << *value;
1090     return ANEURALNETWORKS_NO_ERROR;
1091 }
1092 
// Advances the controller to the next step of the plan and produces an
// executor for it (or a null *executor when the plan has completed).
// mNextStepIndex acts as the state-machine cursor: kBadStepIndex marks both
// failure and "past the end". For SIMPLE plans there is exactly one step;
// COMPOUND plans delegate to nextCompound(). syncFdOfLastStep is the sync
// fence of the previous step, recorded for use by the upcoming step.
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController,
                        int syncFdOfLastStep) const {
    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        // An empty plan has no steps; report completion immediately.
        CHECK_EQ(controller->mNextStepIndex, 0u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = simple();
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mFallbackNextStepIndex = 0;
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        CHECK_EQ(controller->mNextStepIndex, 1u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    return nextCompound(controller, executor, burstController);
}
1139 
nextCompound(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,std::shared_ptr<ExecutionBurstController> * burstController) const1140 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1141                                 std::shared_ptr<StepExecutor>* executor,
1142                                 std::shared_ptr<ExecutionBurstController>* burstController) const {
1143     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1144         return ANEURALNETWORKS_OP_FAILED;
1145     }
1146 
1147     auto compoundBody = compound();
1148     if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1149         controller->mNextStepIndex = Controller::kBadStepIndex;  // end
1150         return ANEURALNETWORKS_NO_ERROR;
1151     }
1152 
1153     const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1154     if (const IfStep* step = logicalStep->tryIfStep()) {
1155         return nextCompound(step, controller, executor, burstController);
1156     } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1157         return nextCompound(step, controller, executor, burstController);
1158     } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1159         return nextCompound(step, controller, executor, burstController);
1160     } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1161         return nextCompound(step, controller, executor, burstController);
1162     } else {
1163         CHECK(false) << "Unknown step variant";
1164         return ANEURALNETWORKS_BAD_STATE;
1165     }
1166 }
1167 
nextCompound(const ExecutionStep * step,std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,std::shared_ptr<ExecutionBurstController> * burstController) const1168 int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
1169                                 std::shared_ptr<StepExecutor>* executor,
1170                                 std::shared_ptr<ExecutionBurstController>* burstController) const {
1171     VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
1172                     << step->getDevice()->getName();
1173     *executor =
1174             std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
1175                                            step->getDevice(), step->getPreparedStepModel(), step);
1176     step->mapInputsAndOutputs(
1177             *executor, controller->mTemporaries.get(),
1178             controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
1179             controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
1180     if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
1181         *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
1182     }
1183 
1184     controller->mFallbackNextStepIndex = controller->mNextStepIndex;
1185     controller->mNextStepIndex++;
1186     return ANEURALNETWORKS_NO_ERROR;
1187 }
1188 
1189 // The first argument is the "source" operand, the second operand is the "destination".
setInput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1190 void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
1191                                          const SourceOperandIndex& innerOperand) {
1192     VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
1193                     << toString(outerOperand);
1194 #ifdef NN_DEBUGGABLE
1195     CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
1196                      mSourceOperandToInputIndex.count(innerOperand) +
1197                      mSourceOperandToOutputIndex.count(innerOperand) +
1198                      mSourceOperandToConstantReference.count(innerOperand),
1199              1u);
1200 #endif
1201     mSourceOperandToOffsetOfTemporary.erase(innerOperand);
1202     mSourceOperandToInputIndex.erase(innerOperand);
1203     mSourceOperandToOutputIndex.erase(innerOperand);
1204     mSourceOperandToConstantReference.erase(innerOperand);
1205     if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
1206         it != mSourceOperandToOffsetOfTemporary.end()) {
1207         mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
1208     } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
1209                it != mSourceOperandToInputIndex.end()) {
1210         mSourceOperandToInputIndex.emplace(innerOperand, it->second);
1211     } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1212                it != mSourceOperandToOutputIndex.end()) {
1213         mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1214     } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
1215                it != mSourceOperandToConstantReference.end()) {
1216         mSourceOperandToConstantReference.emplace(innerOperand, it->second);
1217     } else {
1218         CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
1219                      << " from operand " << toString(outerOperand);
1220     }
1221 }
1222 
1223 // The first argument is the "source" operand, the second operand is the "destination".
setOutput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1224 void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
1225                                           const SourceOperandIndex& innerOperand) {
1226     VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
1227                     << toString(outerOperand);
1228 #ifdef NN_DEBUGGABLE
1229     CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
1230                      mSourceOperandToOutputIndex.count(innerOperand),
1231              1u);
1232 #endif
1233     mSourceOperandToOffsetOfTemporary.erase(innerOperand);
1234     mSourceOperandToOutputIndex.erase(innerOperand);
1235     if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
1236         it != mSourceOperandToOffsetOfTemporary.end()) {
1237         mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
1238     } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1239                it != mSourceOperandToOutputIndex.end()) {
1240         mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1241     } else {
1242         CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
1243                      << " from operand " << toString(outerOperand);
1244     }
1245 }
1246 
waitForLastStepSyncFence() const1247 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1248     if (mLastStepSyncFd == -1) {
1249         return ANEURALNETWORKS_NO_ERROR;
1250     }
1251     VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1252     auto r = syncWait(mLastStepSyncFd, -1);
1253     int n = ANEURALNETWORKS_NO_ERROR;
1254     if (r != FenceState::SIGNALED) {
1255         LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1256         n = ANEURALNETWORKS_OP_FAILED;
1257     }
1258     return n;
1259 }
1260 
// Advances execution past an IF step: reads the boolean condition operand,
// selects the then or else branch, points the controller at the branch's
// first step, and remaps the branch model's inputs/outputs onto the IF
// operation's outer operands before dispatching to the next step.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: " << toString(*step);
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    // Jump into the selected branch and pick that branch's operand lists.
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    return nextCompound(controller, executor, burstController);
}
1299 
// Advances execution past a WHILE step. A WHILE alternates between two
// stages recorded in the per-step WhileState:
// - EVALUATE_CONDITION: map the condition model's inputs (outer inputs on
//   iteration 0, body outputs afterwards) and jump to the condition step.
// - EVALUATE_BODY: enforce the loop timeout, read the condition output, then
//   either jump into the body model (swapping the double-buffered temporaries
//   that back the body outputs) or exit the loop, copying the final values
//   out to the WHILE operation's outer outputs.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        // kOutsideLoop marks a fresh entry into the loop; otherwise this is
        // the next iteration.
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            // Start the loop-timeout clock on entry to the loop.
            state.startTime = std::chrono::steady_clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Cond inputs past the body output count have no corresponding
            // body output, so they always come from the outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Abort the loop if it has run longer than the configured timeout.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = std::chrono::steady_clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            // Swap the two temporaries backing each body output (tmp1/tmp2 in
            // the table above) so this iteration writes into the buffer that
            // it is not reading as input.
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand],
                          controller->mSourceOperandToOffsetOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController);
}
1420 
// Advances execution past a GOTO step: an unconditional jump used to stitch
// together the flattened control-flow representation (e.g. jumping from the
// end of a then-branch past the else-branch).
int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: " << toString(*step);
    controller->mNextStepIndex = step->gotoStepIndex;
    return nextCompound(controller, executor, burstController);
}
1428 
becomeCompoundIfEmpty()1429 void ExecutionPlan::becomeCompoundIfEmpty() {
1430     CHECK(mState != SIMPLE);
1431     if (mState == EMPTY) {
1432         mBody = new CompoundBody();
1433         mState = COMPOUND;
1434     }
1435 }
1436 
createNewExecutionStep(uint32_t sourceModelIndex,const std::shared_ptr<Device> device)1437 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1438                                                      const std::shared_ptr<Device> device) {
1439     becomeCompoundIfEmpty();
1440     auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1441                                               compound()->mSteps.size(), sourceModelIndex, device);
1442     compound()->mSteps.push_back(step);
1443     return step->executionStep();
1444 }
1445 
createNewIfStep()1446 IfStep* ExecutionPlan::createNewIfStep() {
1447     becomeCompoundIfEmpty();
1448     auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1449     step->ifStep()->index = compound()->mSteps.size();
1450     compound()->mSteps.push_back(step);
1451     return step->ifStep();
1452 }
1453 
createNewWhileStep()1454 WhileStep* ExecutionPlan::createNewWhileStep() {
1455     becomeCompoundIfEmpty();
1456     auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1457     step->whileStep()->index = compound()->mSteps.size();
1458     compound()->mSteps.push_back(step);
1459     return step->whileStep();
1460 }
1461 
createNewGotoStep()1462 GotoStep* ExecutionPlan::createNewGotoStep() {
1463     becomeCompoundIfEmpty();
1464     auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1465     step->gotoStep()->index = compound()->mSteps.size();
1466     compound()->mSteps.push_back(step);
1467     return step->gotoStep();
1468 }
1469 
// Transitions an EMPTY plan to SIMPLE: the whole model runs as one step on
// the given device.
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    CHECK(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheDir, mToken);
    mState = SIMPLE;
}
1476 
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1477 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1478     auto [it, isNew] =
1479             compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1480     CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1481                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1482 }
1483 
dump() const1484 void ExecutionPlan::dump() const {
1485     if (mBody) {
1486         mBody->dump();
1487     } else {
1488         VLOG(COMPILATION) << "EMPTY";
1489     }
1490 }
1491 
reset()1492 void ExecutionPlan::reset() {
1493     if (mBody) {
1494         delete mBody;
1495         mBody = nullptr;
1496     }
1497     mState = EMPTY;
1498 }
1499 
// Returns true when the plan is a single step scheduled on the CPU device.
bool ExecutionPlan::isSimpleCpu() const {
    return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
}
1503 
forTest_getKind() const1504 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1505     switch (mState) {
1506         case EMPTY:
1507             return Kind::EMPTY;
1508         case SIMPLE:
1509             nnAssert(mBody);
1510             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1511         case COMPOUND:
1512             nnAssert(mBody);
1513             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1514         default:
1515             nnAssert(!"unexpected state");
1516             return Kind::ERROR;
1517     }
1518 }
1519 
// Test-only accessor: the device of a SIMPLE plan.
std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    return simple()->mDevice;
}
1523 
// Test-only accessor: the logical steps of a COMPOUND plan.
const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}
1527 
// Test-only accessor: whether any step model output has an unknown size.
bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
    return mBody->hasStepModelOutputsOfUnknownSize();
}
1531 
// Test-only accessor: the compilation cache token of a SIMPLE plan.
const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    return simple()->mToken.getCacheToken();
}
1535 
// Logs the single device a SIMPLE plan runs on.
void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}
1539 
dump() const1540 void ExecutionPlan::CompoundBody::dump() const {
1541     for (const auto& step : mSteps) {
1542         step->dump();
1543     }
1544 }
1545 
// A SIMPLE plan has exactly one prepared model, so a main model input maps
// directly to the same input index of that model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}
1550 
// A SIMPLE plan has exactly one prepared model, so a main model output maps
// directly to the same output index of that model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
                                                        const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::OUTPUT, index);
}
1555 
1556 // Map an input role of the main model to the input/output roles in the step models:
1557 // - An input role of the main model may be used as an input of multiple step models.
1558 // - An input role of the main model should not be used as an output of any step model.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const1559 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
1560                                                          const StepRoleCallback& callback) const {
1561     for (const auto& logicalStep : mSteps) {
1562         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1563             // Model input as step model input.
1564             const auto& inputMapping = step->getInputIndexStepModelToMainModel();
1565             for (uint32_t i = 0; i < inputMapping.size(); i++) {
1566                 if (inputMapping[i] == index) {
1567                     callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1568                 }
1569             }
1570         }
1571     }
1572 }
1573 
1574 // Map an output role of the main model to the input/output roles in the step models:
1575 // - An output role of the main model may only be used as one output of one single step model.
1576 // - An output role of the main model may be used as an input of multiple step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const1577 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
1578                                                           const StepRoleCallback& callback) const {
1579     bool found = false;
1580     for (const auto& logicalStep : mSteps) {
1581         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1582             // Model output as step model output.
1583             if (!found) {
1584                 const auto& outputMapping = step->getOutputIndexStepModelToMainModel();
1585                 for (uint32_t i = 0; i < outputMapping.size(); i++) {
1586                     if (outputMapping[i] == index) {
1587                         callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i);
1588                         found = true;
1589                         break;
1590                     }
1591                 }
1592             }
1593             // Model output as step model input.
1594             const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel();
1595             for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) {
1596                 if (inputToOutputMapping[i] == index) {
1597                     callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1598                 }
1599             }
1600         }
1601     }
1602 }
1603 
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const std::optional<Deadline> & deadline,ExecutionPlan * plan) const1604 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
1605                                    uint32_t preference, uint32_t priority,
1606                                    const std::optional<Deadline>& deadline,
1607                                    ExecutionPlan* plan) const {
1608     uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
1609     NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
1610                                                 deadline, plan));
1611     int n = plan->finish(preference, priority, deadline);
1612     if (VLOG_IS_ON(COMPILATION)) {
1613         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
1614         logModelToInfo(makeHidlModel());
1615         plan->dump();
1616     }
1617     return n;
1618 }
1619 
partitionTheWorkInternal(uint32_t sourceModelIndex,const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const std::optional<Deadline> & deadline,ExecutionPlan * plan) const1620 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
1621                                            const std::vector<std::shared_ptr<Device>>& devices,
1622                                            uint32_t preference, uint32_t priority,
1623                                            const std::optional<Deadline>& deadline,
1624                                            ExecutionPlan* plan) const {
1625     // This function uses a heuristic approach to partitioning the graph.
1626     // It should be good enough for the first release.
1627 
1628     SourceModels* sourceModels = &plan->getSourceModels();
1629     const size_t deviceCount = devices.size();
1630     const size_t operationCount = mOperations.size();
1631 
1632     VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
1633                       << "sourceModelIndex = " << sourceModelIndex << ", "
1634                       << "deviceCount = " << deviceCount << ", "
1635                       << "operationCount = " << operationCount;
1636 
1637     // Figure out where each operation will best execute.
1638     // The value of the vector is the index in the devices vector.
1639     std::vector<int> bestDeviceForOperation(operationCount);
1640     NN_RETURN_IF_ERROR(
1641             findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
1642 
1643     // A special value produced by findBestDeviceForEachOperation meaning that
1644     // this is a control flow operation scheduled for interpreted execution
1645     // (see LogicalStep).
1646     const int kControlFlowInterpreter = deviceCount;
1647 
1648     // If one device will run all the operations, we don't need to split the
1649     // work. This shortcut does not apply when recursively partitioning
1650     // referenced models because our plan representation is flat.
1651     if (sourceModelIndex == kMainModelInSourceModels &&
1652         std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
1653                            std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
1654         const int bestDeviceIndex = bestDeviceForOperation[0];
1655         // Bypass the partitioning process unless the only operation is a
1656         // control flow operation scheduled for interpreted execution.
1657         if (bestDeviceIndex != kControlFlowInterpreter) {
1658             VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
1659                               << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
1660             plan->becomeSingleStep(devices[bestDeviceIndex], this);
1661             return ANEURALNETWORKS_NO_ERROR;
1662         }
1663     }
1664 
1665     // No easy solution, we need to split the work.
1666 
1667     // We keep track of the operations that are ready to run for each device.
1668     // perDeviceQueue[deviceCount] is for interpreted execution of control flow
1669     // (see LogicalStep).
1670     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
1671 
1672     // This helper function enqueues the operation on the appropriate queue.
1673     auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
1674         int deviceIndex = bestDeviceForOperation[operationIndex];
1675         perDeviceQueue[deviceIndex].push(operationIndex);
1676         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
1677                           << deviceIndex;
1678     };
1679 
1680     // This helper function finds a device that has operations ready to process.
1681     // We start by looking at the control flow queue, and then look at the
1682     // devices in reverse order (i.e., starting at the end of the devices
1683     // vector). Earlier devices have a chance to prepare more of the inputs
1684     // required by other devices. This function returns -1 if all queues are
1685     // empty.
1686     auto findNextDeviceToProcess = [&]() -> int {
1687         for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
1688             if (!perDeviceQueue[i].empty()) {
1689                 return i;
1690             }
1691         }
1692         return -1;
1693     };
1694 
1695     OperandTracker tracker(this, enqueueOnAppropriateDevice);
1696     // For each iteration of this loop, we'll create an execution step.
1697     while (true) {
1698         // Find the device we'll do this step for.
1699         int deviceIndex = findNextDeviceToProcess();
1700         VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
1701         if (deviceIndex < 0) {
1702             break;
1703         }
1704 
1705         // Assign as much as possible to this device.
1706         auto& queue = perDeviceQueue[deviceIndex];
1707         if (deviceIndex != kControlFlowInterpreter) {
1708             ExecutionStep* step =
1709                     plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
1710             while (!queue.empty()) {
1711                 uint32_t operationIndex = queue.front();
1712                 queue.pop();
1713                 int n = step->addOperation(operationIndex);
1714                 if (n != ANEURALNETWORKS_NO_ERROR) {
1715                     LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
1716                     return n;
1717                 }
1718                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1719             }
1720         } else {
1721             while (!queue.empty()) {
1722                 uint32_t operationIndex = queue.front();
1723                 queue.pop();
1724                 const Operation& operation = getOperation(operationIndex);
1725                 if (operation.type == OperationType::IF) {
1726                     namespace op = operation_if;
1727                     const Operand& thenOperand =
1728                             getOperand(operation.inputs[op::kThenModelOperand]);
1729                     const Operand& elseOperand =
1730                             getOperand(operation.inputs[op::kElseModelOperand]);
1731                     const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1732                     const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1733                     uint32_t thenModelIndex = sourceModels->addModel(thenModel);
1734                     uint32_t elseModelIndex = sourceModels->addModel(elseModel);
1735 
1736                     // Emits the following:
1737                     // Index  Step
1738                     //   i    if then=(i + 1) else=(j + 1)
1739                     //  ...   (then model steps)
1740                     //   j    goto k
1741                     //  ...   (else model steps)
1742                     //   k    (steps after the IF)
1743                     IfStep* ifStep = plan->createNewIfStep();
1744                     ifStep->conditionOperandIndex = SourceOperandIndex(
1745                             sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
1746                     ifStep->thenStepIndex = plan->getNextStepIndex();
1747                     NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
1748                             thenModelIndex, devices, preference, priority, deadline, plan));
1749                     GotoStep* afterThenBranch = plan->createNewGotoStep();
1750                     ifStep->elseStepIndex = plan->getNextStepIndex();
1751                     NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
1752                             elseModelIndex, devices, preference, priority, deadline, plan));
1753                     afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
1754 
1755                     // Outer model operands.
1756                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1757                         ifStep->outerInputOperands.emplace_back(sourceModelIndex,
1758                                                                 operation.inputs[i]);
1759                     }
1760                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1761                         ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
1762                                                                  operation.outputs[i]);
1763                     }
1764                     // Then model operands.
1765                     for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
1766                         ifStep->thenBranchInputOperands.emplace_back(
1767                                 thenModelIndex, thenModel->getInputOperandIndex(i));
1768                     }
1769                     for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
1770                         ifStep->thenBranchOutputOperands.emplace_back(
1771                                 thenModelIndex, thenModel->getOutputOperandIndex(i));
1772                     }
1773                     // Else model operands.
1774                     for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
1775                         ifStep->elseBranchInputOperands.emplace_back(
1776                                 elseModelIndex, elseModel->getInputOperandIndex(i));
1777                     }
1778                     for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
1779                         ifStep->elseBranchOutputOperands.emplace_back(
1780                                 elseModelIndex, elseModel->getOutputOperandIndex(i));
1781                     }
1782                 } else if (operation.type == OperationType::WHILE) {
1783                     namespace op = operation_while;
1784                     const Operand& condOperand =
1785                             getOperand(operation.inputs[op::kCondModelOperand]);
1786                     const Operand& bodyOperand =
1787                             getOperand(operation.inputs[op::kBodyModelOperand]);
1788                     const ModelBuilder* condModel = getReferencedModel(condOperand);
1789                     const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1790                     uint32_t condModelIndex = sourceModels->addModel(condModel);
1791                     uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
1792 
1793                     // Emits the following:
1794                     // Index  Step
1795                     //   i    while cond=(i + 1) body=(j + 1) exit=(k + 1)
1796                     //  ...   (cond model steps)
1797                     //   j    goto i
1798                     //  ...   (body model steps)
1799                     //   k    goto i
1800                     //  ...   (steps after the WHILE)
1801                     //
1802                     //  Note that WhileStep has WhileState associated with it.
1803                     WhileStep* whileStep = plan->createNewWhileStep();
1804                     whileStep->condStepIndex = plan->getNextStepIndex();
1805                     NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
1806                             condModelIndex, devices, preference, priority, deadline, plan));
1807                     GotoStep* afterCond = plan->createNewGotoStep();
1808                     afterCond->gotoStepIndex = whileStep->index;
1809                     whileStep->bodyStepIndex = plan->getNextStepIndex();
1810                     NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
1811                             bodyModelIndex, devices, preference, priority, deadline, plan));
1812                     GotoStep* afterBody = plan->createNewGotoStep();
1813                     afterBody->gotoStepIndex = whileStep->index;
1814                     whileStep->exitStepIndex = plan->getNextStepIndex();
1815 
1816                     // Outer model operands.
1817                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1818                         whileStep->outerInputOperands.emplace_back(sourceModelIndex,
1819                                                                    operation.inputs[i]);
1820                     }
1821                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1822                         whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
1823                                                                     operation.outputs[i]);
1824                     }
1825                     // Cond model operands.
1826                     for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
1827                         whileStep->condInputOperands.emplace_back(
1828                                 condModelIndex, condModel->getInputOperandIndex(i));
1829                     }
1830                     whileStep->condOutputOperand =
1831                             SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
1832                     // Body model operands.
1833                     for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
1834                         whileStep->bodyInputOperands.emplace_back(
1835                                 bodyModelIndex, bodyModel->getInputOperandIndex(i));
1836                     }
1837                     for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
1838                         whileStep->bodyOutputOperands.emplace_back(
1839                                 bodyModelIndex, bodyModel->getOutputOperandIndex(i));
1840                     }
1841                 } else {
1842                     CHECK(false) << toString(operation.type) << " is not a control flow operation";
1843                 }
1844                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1845             }
1846         }
1847     }
1848     return ANEURALNETWORKS_NO_ERROR;
1849 }
1850 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const1851 float ModelBuilder::getPerformance(uint32_t preference,
1852                                    const std::shared_ptr<Device> device) const {
1853     // Note that we will call this method multiple times per compilation with
1854     // the same arguments if there are nested control flow operations and we
1855     // decide to execute the outer operation on the ExecutionPlan::next()
1856     // interpreter.
1857     //
1858     // This is a potential compilation performance problem. To work around it,
1859     // the performance value could be cached for the duration of a compilation.
1860     float perf = 0;
1861     const size_t operationCount = mOperations.size();
1862     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
1863         perf += getPerformance(preference, device, operationIndex);
1864     }
1865     return perf;
1866 }
1867 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const1868 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
1869                                    uint32_t operationIndex) const {
1870     auto applyPreference = [preference](const PerformanceInfo& perf) {
1871         return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
1872     };
1873 
1874     const Operation& operation = getOperation(operationIndex);
1875 
1876     if (operation.type == OperationType::IF) {
1877         namespace op = operation_if;
1878         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1879         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1880         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1881         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1882         return applyPreference(device->getIfPerformance()) +
1883                0.5 * (thenModel->getPerformance(preference, device) +
1884                       elseModel->getPerformance(preference, device));
1885     }
1886 
1887     if (operation.type == OperationType::WHILE) {
1888         namespace op = operation_while;
1889         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1890         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1891         const ModelBuilder* condModel = getReferencedModel(condOperand);
1892         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1893         return applyPreference(device->getWhilePerformance()) +
1894                condModel->getPerformance(preference, device) +
1895                bodyModel->getPerformance(preference, device);
1896     }
1897 
1898     // TODO This assumes that the type is dictated by the first operand. This is
1899     // currently the case but is not a safe assumption to make in the long term.
1900     const uint32_t operandIndex = operation.inputs[0];
1901     const OperandType operandType = mOperands[operandIndex].type;
1902     switch (operandType) {
1903         case OperandType::FLOAT32:
1904             if (mRelaxComputationFloat32toFloat16) {
1905                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
1906             }
1907             break;
1908         case OperandType::TENSOR_FLOAT32:
1909             if (mRelaxComputationFloat32toFloat16) {
1910                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
1911             }
1912             break;
1913         default:
1914             break;
1915     }
1916 
1917     return applyPreference(device->getPerformance(operandType));
1918 }
1919 
isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const1920 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
1921     auto containsUnknownSize = [](const ModelBuilder* model,
1922                                   const std::vector<uint32_t>& operandIndexes) {
1923         for (uint32_t operandIndex : operandIndexes) {
1924             if (hasUnknownSize(model->getOperand(operandIndex))) {
1925                 return true;
1926             }
1927         }
1928         return false;
1929     };
1930 
1931     const Operation& operation = getOperation(operationIndex);
1932 
1933     if (operation.type == OperationType::IF) {
1934         namespace op = operation_if;
1935         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1936         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1937         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1938         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1939         return containsUnknownSize(this, operation.inputs) ||
1940                containsUnknownSize(this, operation.outputs) ||
1941                containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
1942                containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
1943                containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
1944                containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
1945     }
1946 
1947     if (operation.type == OperationType::WHILE) {
1948         namespace op = operation_while;
1949         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1950         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1951         const ModelBuilder* condModel = getReferencedModel(condOperand);
1952         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1953         return containsUnknownSize(this, operation.inputs) ||
1954                containsUnknownSize(this, operation.outputs) ||
1955                containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
1956                containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
1957                containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
1958                containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
1959     }
1960 
1961     // Not a control flow operation.
1962     return false;
1963 }
1964 
supportedByControlFlowInterpreter(uint32_t operationIndex) const1965 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
1966     const Operation& operation = getOperation(operationIndex);
1967     return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
1968            // The partitioner does not support dynamic temporaries (b/132458982).
1969            !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
1970 }
1971 
1972 namespace {
1973 
1974 // This class determines whether a given device can execute a given operation
1975 class CanDo {
1976    public:
CanDo()1977     CanDo() {}
1978 
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)1979     void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
1980         mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
1981     }
1982 
check(size_t operationIndex) const1983     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
1984 
1985    private:
1986     std::vector<bool> mSupportsOperationByIndex;
1987 };
1988 
1989 }  // anonymous namespace
1990 
// For each operation of this model, picks the device that can run it with the
// best (lowest) performance value and stores the chosen index into
// |*bestDeviceForOperation|. A stored value equal to devices.size() is a
// sentinel meaning "run on the ExecutionPlan::next() control flow
// interpreter". Returns ANEURALNETWORKS_NO_ERROR, or ANEURALNETWORKS_BAD_DATA
// if some operation is supported by no device.
//
// NOTE(review): the code indexes |*bestDeviceForOperation| without resizing,
// so the caller must have sized it to at least mOperations.size().
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing());

    // Query every device once, up front, for the set of operations it supports.
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            // Lower perfVal is better; ties are broken in favor of the CPU
            // device by the equality clause below.
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << toString(operation.type);
                }
            }
        }

        if (bestChoice < 0) {
            // No device (not even CPU fallback) supports this operation.
            LOG(ERROR) << "No driver can do operation " << toString(operation.type);
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = -1"
                              << " (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2065 
2066 }  // namespace nn
2067 }  // namespace android
2068