• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "ExecutionPlan"
18 
19 #include "ExecutionPlan.h"
20 
21 #include <ControlFlow.h>
22 #include <CpuExecutor.h>
23 #include <GraphDump.h>
24 #include <LegacyUtils.h>
25 #include <MetaModel.h>
26 #include <OperationsUtils.h>
27 #include <TokenHasher.h>
28 #include <Tracing.h>
29 #include <fcntl.h>
30 #include <nnapi/IBurst.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 
34 #include <algorithm>
35 #include <functional>
36 #include <map>
37 #include <memory>
38 #include <mutex>
39 #include <queue>
40 #include <set>
41 #include <string>
42 #include <type_traits>
43 #include <unordered_set>
44 #include <utility>
45 #include <vector>
46 
47 #include "BurstBuilder.h"
48 #include "CompilationBuilder.h"
49 #include "ExecutionBuilder.h"
50 #include "ExecutionCallback.h"
51 #include "Manager.h"
52 #include "ModelBuilder.h"
53 #include "TypeManager.h"
54 
55 namespace android {
56 namespace nn {
57 
58 namespace {
59 
60 // The index of the main model in SourceModels.
61 constexpr uint32_t kMainModelInSourceModels = 0;
62 
63 constexpr uint32_t kNoPadding = 1;
64 
65 // Compiles the model on device.
66 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
67 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
68 // operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
69 // device name, device version string, and the execution preference in this function.
compile(const Device & device,const ModelBuilder & model,int executionPreference,int compilationPriority,const OptionalTimePoint & deadline,const CacheInfo & cacheInfo,TokenHasher * token,std::shared_ptr<RuntimePreparedModel> * preparedModel)70 int compile(const Device& device, const ModelBuilder& model, int executionPreference,
71             int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
72             TokenHasher* token, std::shared_ptr<RuntimePreparedModel>* preparedModel) {
73     CHECK(token != nullptr);
74     CHECK(preparedModel != nullptr);
75     *preparedModel = nullptr;
76 
77     std::optional<CacheToken> cacheToken;
78     if (device.isCachingSupported() && token->ok() &&
79         token->updateFromString(device.getName().c_str()) &&
80         token->updateFromString(device.getVersionString().c_str()) &&
81         token->update(&executionPreference, sizeof(executionPreference)) &&
82         token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
83         cacheToken = CacheToken{};
84         const uint8_t* tokenPtr = token->getCacheToken();
85         std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
86     }
87 
88     const ModelFactory makeModel = [&model] { return model.makeModel(); };
89     const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
90     const Priority priority = convertToCanonicalPriority(compilationPriority);
91     const auto [n, returnedPreparedModel] =
92             device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken);
93     *preparedModel = returnedPreparedModel;
94     return n;
95 }
96 
97 typedef std::function<void(uint32_t)> OperationReadyCallback;
98 
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)99 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
100                            const Operand& fromOperand) {
101     if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
102         std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
103         auto& fromChannelQuant =
104                 std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
105         ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
106                 .channelDim = fromChannelQuant.channelDim,
107                 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
108                 .scales = fromChannelQuant.scales.data(),
109         };
110         return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
111     } else if (isExtension(fromOperand.type) &&
112                std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
113         auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
114         return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
115                                              extensionData.size());
116     } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
117                fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
118         LOG(ERROR) << "Type " << fromOperand.type
119                    << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
120         return ANEURALNETWORKS_BAD_DATA;
121     } else {
122         return ANEURALNETWORKS_NO_ERROR;
123     }
124 }
125 
// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away (no inputs with unknown values) and calls cb for
    // each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The outputs
    // of the operation now being known, this may make new operations
    // able to run.  Calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps each not-yet-known operand to the indexes of the operations that
    // consume it (an operand may feed several operations, hence multimap).
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation, how many
                                               // of its inputs are still unknown
};
143 
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)144 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
145     : mModel(model) {
146     const auto& operations = mModel->getOperations();
147     mUnknownInputCount.resize(operations.size());
148     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
149         const Operation& operation = operations[operationIndex];
150         uint32_t count = 0;
151         for (uint32_t operandIndex : operation.inputs) {
152             auto lifetime = mModel->getOperand(operandIndex).lifetime;
153             if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
154                 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
155                 count++;
156                 mOperandToOperations.emplace(operandIndex, operationIndex);
157             }
158         }
159         if (count == 0) {
160             cb(operationIndex);
161         }
162         mUnknownInputCount[operationIndex] = count;
163     }
164 }
165 
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)166 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
167     // Mark all its outputs as known.
168     const Operation& operation = mModel->getOperations()[operationIndex];
169     for (uint32_t operandIndex : operation.outputs) {
170         auto range = mOperandToOperations.equal_range(operandIndex);
171         for (auto i = range.first; i != range.second; i++) {
172             uint32_t& count = mUnknownInputCount[i->second];
173             if (--count == 0) {
174                 cb(i->second);
175             }
176         }
177     }
178 }
179 
addTemporary(uint32_t * totalSizeOfTemporaries,uint32_t size,uint32_t alignment,uint32_t padding)180 StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
181                                      uint32_t alignment, uint32_t padding) {
182     // TODO: what about overflow?
183     *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
184     const uint32_t offset = *totalSizeOfTemporaries;
185     size = roundUp(size, padding);
186     *totalSizeOfTemporaries += size;
187     return {.offset = offset, .paddedLength = size};
188 };
189 
toString(SourceOperandIndex sourceOperandIndex)190 std::string toString(SourceOperandIndex sourceOperandIndex) {
191     return "(" + std::to_string(sourceOperandIndex.first) + ", " +
192            std::to_string(sourceOperandIndex.second) + ")";
193 };
194 
195 // A helper class to analyze the step roles of all partition boundary operands.
196 //
197 // To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
198 // with the following two methods:
199 //   - addRole: Add a step role to a boundary operand
200 //   - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
201 //     operand. All of the step roles of the "dest" operand are also possible step roles of the
202 //     "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
203 //     of an interpreted IF operation may be directly used as all step roles of the corresponding
204 //     input operand of the then and else models. Note that this relationship is directional --
205 //     (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
206 //     shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
207 //     produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
208 //     because there must be a root operand of each step role for the memory to be allocated on
209 //     behalf of.
210 //
211 class StepRoleAnalyzer {
212    public:
analyze(const std::function<void (StepRoleAnalyzer &)> & setup)213     static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
214             const std::function<void(StepRoleAnalyzer&)>& setup) {
215         StepRoleAnalyzer analyzer;
216         setup(analyzer);
217         return analyzer.finish();
218     }
219 
addRole(const ExecutionStep & step,uint32_t operandIndex,IOType type,uint32_t stepIOIndex)220     void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
221                  uint32_t stepIOIndex) {
222         SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
223         mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
224     }
225 
setUsedBy(const SourceOperandIndex & source,const SourceOperandIndex & dest)226     void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
227         mUsedBy[source].emplace(dest);
228     }
229 
230    private:
231     StepRoleAnalyzer() = default;
232 
233     // Merges the step roles of the destination operands to the source operands
234     // and returns the final map.
finish()235     std::map<SourceOperandIndex, std::set<StepRole>> finish() {
236         for (const auto& [source, _] : mUsedBy) {
237             finishHelper(source);
238         }
239         return std::move(mRoles);
240     }
241 
finishHelper(SourceOperandIndex current)242     void finishHelper(SourceOperandIndex current) {
243         if (mProcessedOperands.count(current) > 0) return;
244         mProcessedOperands.insert(current);
245         const auto it = mUsedBy.find(current);
246         if (it != mUsedBy.end()) {
247             auto& roles = mRoles[current];
248             // Merge the step roles of the destination operands.
249             for (const auto& dest : it->second) {
250                 finishHelper(dest);
251                 const auto& destRoles = mRoles[dest];
252                 roles.insert(destRoles.begin(), destRoles.end());
253             }
254         }
255     }
256 
257     // A map from the source operand to its step roles.
258     std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
259     // A map from the source operand to a set of destination operands that may directly
260     // use the memory of the source operand.
261     std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
262     // Used in finish to track which operand has been processed.
263     std::set<SourceOperandIndex> mProcessedOperands;
264 };
265 
266 }  // namespace
267 
vlogDump(const char * context) const268 void DynamicTemporaries::vlogDump(const char* context) const {
269     if (empty()) {
270         return;
271     }
272     if (context) {
273         VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
274     }
275     for (const auto& temp : mSourceOperandToTemporary) {
276         VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
277                         << ", stepIndex = " << temp.second.stepIndex
278                         << ", offset = " << temp.second.offset
279                         << ", dimensions = " << toString(temp.second.dimensions)
280                         << ", paddedLength = " << temp.second.paddedLength
281                         << ", alignment = " << temp.second.alignment
282                         << ", padding = " << temp.second.padding;
283     }
284 }
285 
declare(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex,const Dimensions & initialDimensions,uint32_t initialLength,uint32_t alignment,uint32_t padding)286 void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
287                                  const Dimensions& initialDimensions, uint32_t initialLength,
288                                  uint32_t alignment, uint32_t padding) {
289     VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
290                     << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
291                     << ", initialDimensions = " << toString(initialDimensions)
292                     << ", initialLength = " << initialLength << ", alignment = " << alignment
293                     << ", padding = " << padding << ")";
294     CHECK(!mDeclared);
295     CHECK_GT(initialLength, 0u);
296     const uint32_t paddedLength = roundUp(initialLength, padding);
297     auto [_, isNew] = mSourceOperandToTemporary.emplace(
298             sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
299                                                          paddedLength, alignment, padding});
300     CHECK(isNew);
301     mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
302 }
303 
redeclare(SourceOperandIndex sourceOperandIndex,const Dimensions & newDimensions,uint32_t newLength)304 bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
305                                    const Dimensions& newDimensions, uint32_t newLength) {
306     auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
307         VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
308                         << toString(sourceOperandIndex)
309                         << ", newDimensions = " << toString(newDimensions)
310                         << ", newLength = " << newLength << ") -> " << toString(changedShape);
311         return changedShape;
312     };
313 
314     CHECK(mDeclared);
315     CHECK_GT(newLength, 0u);
316 
317     InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
318     const uint32_t paddedLength = roundUp(newLength, temp.padding);
319     if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
320         return createAndLogResult(false);
321     }
322     if (temp.paddedLength < paddedLength) {
323         // Otherwise allocation remains valid, even if it may be suboptimal
324         // (because it uses more space than needed).  Use case: Don't force
325         // client to allocate again just because the client reported more
326         // accurate shape information.
327         mAllocatedStepIndexes.erase(temp.stepIndex);
328     }
329     temp.paddedLength = paddedLength;
330     temp.dimensions = newDimensions;
331     return createAndLogResult(true);
332 }
333 
allocate(uint32_t stepIndex)334 int DynamicTemporaries::allocate(uint32_t stepIndex) {
335     VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";
336 
337     CHECK(mDeclared);
338 
339     const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
340     if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
341         return ANEURALNETWORKS_NO_ERROR;
342     }
343 
344     // perform layout
345     uint32_t newSize = 0;
346     for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
347         InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
348         // temp.paddedLength is already padded in declare and redeclare.
349         CHECK(temp.paddedLength % temp.padding == 0);
350         temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
351     }
352 
353     // perform (re-)allocation
354     // TODO: Today we may shrink the allocation in order to avoid wasting memory.  Is this important
355     //       to conserve memory, or do we waste time reallocating?
356     const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
357                                                 // deallocation/reallocation overhead
358     auto& memory = mStepIndexToMemory[stepIndex];
359     const uint32_t oldSize = (memory ? memory->getSize() : 0);
360     if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
361         // Suitable allocation already exists; nothing to do
362     } else {
363         int n;
364         std::tie(n, memory) = MemoryAshmem::create(newSize);
365         if (n != ANEURALNETWORKS_NO_ERROR) {
366             LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
367                        << " for step " << stepIndex;
368             mAllocatedStepIndexes.erase(stepIndex);
369             return n;
370         }
371     }
372 
373     mAllocatedStepIndexes.insert(stepIndex);
374     return ANEURALNETWORKS_NO_ERROR;
375 }
376 
allocated(uint32_t stepIndex) const377 bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
378     return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
379             mStepIndexToSourceOperandIndexes.end()) ||
380            mAllocatedStepIndexes.count(stepIndex);
381 }
382 
lookup(SourceOperandIndex sourceOperandIndex,bool mustBeAllocated) const383 std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
384         SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
385     CHECK(mDeclared);
386     if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
387         it != mSourceOperandToTemporary.end()) {
388         const InternalLocationAndShape& temp = it->second;
389         const bool isAllocated = allocated(temp.stepIndex);
390         if (mustBeAllocated) {
391             CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
392                                << " must be allocated";
393         }
394         if (isAllocated) {
395             return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
396                                     &temp.dimensions, temp.paddedLength};
397         } else {
398             return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
399         }
400     }
401     return std::nullopt;
402 }
403 
ExecutionStep(ExecutionPlan * plan,uint32_t stepIndex,uint32_t sourceModelIndex,std::shared_ptr<Device> device)404 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
405                              std::shared_ptr<Device> device)
406     : mPlan(plan),
407       mIndex(stepIndex),
408       mSourceModelIndex(sourceModelIndex),
409       mStepModel(),
410       mDevice(device),
411       mToken(plan->getCacheToken()) {}
412 
// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
// |kind| records whether this first sighting of the operand is as an input or
// an output of an operation, which determines the partition-boundary
// bookkeeping below. Returns an ANEURALNETWORKS_* status code.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // An operand may only be re-encountered as an input; every operand has
        // a single definition, so a repeated OUTPUT would be a partitioner bug.
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Extra parameters (per-channel quantization, extension data) are not part
    // of ANeuralNetworksOperandType and must be copied separately.
    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            // Referenced model (e.g. the body of an IF/WHILE).
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}
514 
addOperation(int operationIndex)515 int ExecutionStep::addOperation(int operationIndex) {
516     const Operation& operation = getSourceModel()->getOperation(operationIndex);
517     if (mToken.ok()) {
518         mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
519         mToken.update(&operationIndex, sizeof(operationIndex));
520     }
521 
522     // Convert the input and output operand indexes.
523     //
524     // We expect operations to be added in topological order.  Therefore:
525     //
526     // - We may not have seen an input if it is a model input, a
527     //   constant, or an operand written by a different partition.
528     //
529     // - We should not have seen any outputs.
530     auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
531                               std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
532         const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
533         for (uint32_t i = 0; i < operandCount; i++) {
534             NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
535         }
536         return ANEURALNETWORKS_NO_ERROR;
537     };
538 
539     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
540     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
541     std::vector<uint32_t> inputs(inputCount);
542     std::vector<uint32_t> outputs(outputCount);
543     NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
544     NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
545     return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
546                                    outputCount, outputs.data());
547 }
548 
mapInputsAndOutputs(std::shared_ptr<StepExecutor> executor,const std::vector<OutputShape> * mainModelOutputShapes,const RuntimeMemory * temporaryMemory,const std::map<SourceOperandIndex,StaticTemporaryLocation> & sourceOperandToLocationOfTemporary,const DynamicTemporaries & dynamicTemporaries,const std::map<SourceOperandIndex,uint32_t> & sourceOperandToInputIndex,const std::map<SourceOperandIndex,uint32_t> & sourceOperandToOutputIndex,const std::map<SourceOperandIndex,ConstantReferenceLocation> & sourceOperandToConstantReference) const549 void ExecutionStep::mapInputsAndOutputs(
550         std::shared_ptr<StepExecutor> executor,
551         const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
552         const std::map<SourceOperandIndex, StaticTemporaryLocation>&
553                 sourceOperandToLocationOfTemporary,
554         const DynamicTemporaries& dynamicTemporaries,
555         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
556         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
557         const std::map<SourceOperandIndex, ConstantReferenceLocation>&
558                 sourceOperandToConstantReference) const {
559     auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
560         SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
561         if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
562             it != sourceOperandToLocationOfTemporary.end()) {
563             const auto& loc = it->second;
564             executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
565                                          loc.paddedLength);
566         } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
567             executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
568                                          loc->paddedLength, *loc->dimensions);
569         } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
570                    it != sourceOperandToInputIndex.end()) {
571             executor->mapInput(it->second, stepInputIndex);
572         } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
573                    it != sourceOperandToOutputIndex.end()) {
574             executor->mapOutputToInput(it->second, stepInputIndex,
575                                        mainModelOutputShapes
576                                                ? &mainModelOutputShapes->at(it->second).dimensions
577                                                : nullptr);
578         } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
579                    it != sourceOperandToConstantReference.end()) {
580             // Constant partition boundary operand. This could be an IF branch
581             // model input or a WHILE variable initializer.
582             const auto& loc = it->second;
583             executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
584         } else {
585             CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
586                          << toString(sourceOperandIndex);
587         }
588     };
589     auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
590         SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
591         if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
592             it != sourceOperandToLocationOfTemporary.end()) {
593             const auto& loc = it->second;
594             executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
595                                           loc.paddedLength);
596         } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
597             executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
598                                           loc->paddedLength, *loc->dimensions);
599         } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
600                    it != sourceOperandToOutputIndex.end()) {
601             executor->mapOutput(it->second, stepOutputIndex);
602         } else {
603             CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
604                          << toString(sourceOperandIndex);
605         }
606     };
607     for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
608         mapInput(mStepModelInputs[i].first, i);
609     }
610     for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
611         mapOutput(mStepModelOutputs[i].first, i);
612     }
613 }
614 
findModelOutputsThatAreDownstreamInputs()615 void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
616     auto declareModelOutputIsDownstreamInput =
617             [this](const SourceOperandIndex& sourceOperandIndex) {
618                 const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
619                 CHECK(it != mOutputToDefiningExecutionStep.end());
620                 uint32_t stepIndex = it->second;
621                 CHECK_LT(stepIndex, mSteps.size());
622                 VLOG(COMPILATION)
623                         << "ExecutionStep(" << stepIndex
624                         << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
625                         << toString(sourceOperandIndex) << ")";
626                 CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
627                       mSourceOperandToOutputIndex.end());
628                 mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
629                         mSourceOperandToOutputIndex.at(sourceOperandIndex));
630             };
631     for (const auto& logicalStep : mSteps) {
632         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
633             for (const auto& output : step->getOutputsAsStepModelInputs()) {
634                 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
635                 declareModelOutputIsDownstreamInput(sourceOperandIndex);
636             }
637         }
638     }
639 }
640 
findTempsAsStepModelOutputs()641 void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
642     auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
643         const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
644         if (it == mTemporaryToDefiningExecutionStep.end()) {
645             // The operand is not a temporary or is not defined by an
646             // ExecutionStep (i.e. it's an output of an IF or a WHILE).
647             // The latter case is handled by ExecutionPlan::makeController().
648             return;
649         }
650         uint32_t stepIndex = it->second;
651         CHECK_LT(stepIndex, mSteps.size());
652         mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
653     };
654     for (const auto& logicalStep : mSteps) {
655         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
656             for (const auto& input : step->getTempsAsStepModelInputs()) {
657                 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
658                 recordAsOutputIfTemporary(sourceOperandIndex);
659             }
660         } else if (const IfStep* step = logicalStep->tryIfStep()) {
661             recordAsOutputIfTemporary(step->conditionOperandIndex);
662             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
663                 recordAsOutputIfTemporary(sourceOperandIndex);
664             }
665         } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
666             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
667                 recordAsOutputIfTemporary(sourceOperandIndex);
668             }
669         } else {
670             CHECK(logicalStep->isGoto());
671         }
672     }
673 }
674 
declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex)675 void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
676     VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
677                       << mainModelOutputIndex << ")";
678     const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
679                               mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
680     CHECK(it != mOutputIndexStepModelToMainModel.end());
681     const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
682     CHECK(stepModelOutputIndex < mModelOutputs.size());
683     mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
684 }
685 
recordTempAsStepModelOutput(uint32_t stepOperandIndex)686 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
687     const auto it = mOperandMap.find(stepOperandIndex);
688     CHECK(it != mOperandMap.end());
689     mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
690 }
691 
getSourceModel() const692 const ModelBuilder* ExecutionStep::getSourceModel() const {
693     return mPlan->getSourceModels().getModel(mSourceModelIndex);
694 }
695 
logStepModel() const696 void ExecutionStep::logStepModel() const {
697     VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
698 
699     auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
700         if (!toLog.empty()) {
701             toLog += ", ";
702         }
703         toLog += toString(e.first);
704         toLog += "->";
705         toLog += toString(e.second);
706     };
707 
708     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
709         std::string toLog;
710         for (const auto& e : map) {
711             logRemapEntry(toLog, e);
712         }
713         VLOG(COMPILATION) << name << ": " << toLog;
714     };
715     auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
716         std::string toLog;
717         for (const auto& e : set) {
718             logRemapEntry(toLog, e);
719         }
720         VLOG(COMPILATION) << name << ": " << toLog;
721     };
722 
723     logRemapVector("step model inputs", mStepModelInputs);
724     logRemapVector("step model outputs", mStepModelOutputs);
725     logRemapVector("model inputs", mModelInputs);
726     logRemapVector("model outputs", mModelOutputs);
727     logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
728     logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
729     logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
730 }
731 
hasUnknownSize(const Operand & operand)732 static bool hasUnknownSize(const Operand& operand) {
733     if (operand.dimensions.empty()) {
734         return TypeManager::get()->isTensorType(operand.type);
735     }
736     for (const Dimension& dimension : operand.dimensions) {
737         if (dimension == 0) {
738             return true;
739         }
740     }
741     return false;
742 }
743 
// Finalizes this step's model and compiles it on this step's device:
// - detects temporaries of unknown size produced for downstream steps,
//   reporting them through *hasOutputOfUnknownSize (set to true if any is
//   found; left untouched otherwise);
// - assembles the ordered step model input/output lists and, for steps of the
//   main model, builds the step-index <-> main-model-index remapping tables;
// - identifies I/O, finishes the step model, and compiles it.
// Returns ANEURALNETWORKS_NO_ERROR on success; otherwise a failure code
// (ANEURALNETWORKS_OP_FAILED for an empty-I/O step so the caller may attempt
// full CPU fallback, or whatever identifyInputsAndOutputs/finish/compile
// returned).
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    // Flag temporaries this step produces for downstream consumption whose
    // sizes are not fully specified at compile time ("dynamic temporaries").
    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
                              << " of source graph) has unknown size: " << operand;
        }
    }

    // The step model inherits the main model's relaxed-precision setting.
    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // Assemble the step model inputs in a fixed order -- model inputs first,
    // then temporaries read from other steps, then main model outputs read as
    // inputs. The remapping tables below rely on this ordering.
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    // Step model outputs: main model outputs first, then temporaries exposed
    // for downstream steps.
    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // A step model with no inputs or no outputs is an invalid model. Note that we would like to
    // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from
    // model validation.
    if (hasNoInputsOrNoOutputs()) {
        VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs "
                             "or no outputs";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Only steps of the main model need index remapping tables; control flow
    // referenced models are driven by the interpreter instead.
    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Register the assembled I/O lists with the step model, then finish and
    // compile it.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(),
                   &mToken, &mPreparedStepModel);
}
838 
dump() const839 void ExecutionStep::dump() const {
840     if (VLOG_IS_ON(COMPILATION)) {
841         VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
842         logModelToInfo(mStepModel.makeModel());
843     }
844 }
845 
operator <<(std::ostream & os,const IfStep & step)846 std::ostream& operator<<(std::ostream& os, const IfStep& step) {
847     return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
848               << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
849 }
850 
operator <<(std::ostream & os,const WhileStep & step)851 std::ostream& operator<<(std::ostream& os, const WhileStep& step) {
852     return os << "Step#" << step.index << ": while cond=" << step.condStepIndex
853               << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
854 }
855 
operator <<(std::ostream & os,const GotoStep & step)856 std::ostream& operator<<(std::ostream& os, const GotoStep& step) {
857     return os << "Step#" << step.index << ": goto " << step.gotoStepIndex;
858 }
859 
dump() const860 void LogicalStep::dump() const {
861     if (VLOG_IS_ON(COMPILATION)) {
862         if (const IfStep* step = tryIfStep()) {
863             VLOG(COMPILATION) << *step;
864         } else if (const WhileStep* step = tryWhileStep()) {
865             VLOG(COMPILATION) << *step;
866         } else if (const GotoStep* step = tryGotoStep()) {
867             VLOG(COMPILATION) << *step;
868         } else {
869             executionStep()->dump();
870         }
871     }
872 }
873 
finish(const SourceModels * sourceModels,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,int simulateFailureResultCode)874 int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
875                                         int32_t executionPreference, int32_t priority,
876                                         const OptionalTimePoint& deadline,
877                                         int simulateFailureResultCode) {
878     CHECK(!mSuccessfulFinish);
879     CHECK(!deadline.has_value());
880     const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
881 
882     auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
883         for (const auto& sourceOperandIndex : operands) {
884             const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
885             const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
886             if (hasUnknownSize(operand)) {
887                 return true;
888             }
889         }
890         return false;
891     };
892 
893     findTempsAsStepModelOutputs();
894     for (const auto& logicalStep : mSteps) {
895         if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
896             bool stepHasDynamicTemporaries = false;
897             int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries,
898                                           executionPreference, priority);
899             if (stepHasDynamicTemporaries) {
900                 mHasDynamicTemporaries = true;
901                 if (step->getDevice()->getFeatureLevel() < kHalVersionV1_2ToApi.featureLevel) {
902                     // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT
903                     // must have fully specified dimensions either in the
904                     // Operand or in the RequestArgument.  In the case of a
905                     // dynamic temporary, we won't be able to supply fully
906                     // specified dimensions in either.
907                     VLOG(COMPILATION)
908                             << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex()
909                             << " defines dynamic temporaries but is scheduled on pre-1.2 device "
910                             << step->getDevice()->getName();
911                     if (n == ANEURALNETWORKS_NO_ERROR) {
912                         n = ANEURALNETWORKS_OP_FAILED;
913                     }
914                 }
915             }
916             if (n != ANEURALNETWORKS_NO_ERROR) {
917                 VLOG(COMPILATION)
918                         << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
919                 return n;
920             }
921         } else if (IfStep* step = logicalStep->tryIfStep()) {
922             // The partitioner does not support dynamic temporaries (b/132458982).
923             CHECK(!containsUnknownSize(step->outerInputOperands));
924             CHECK(!containsUnknownSize(step->outerOutputOperands));
925             // step->conditionOperandIndex has a static shape. See b/158557728.
926             CHECK(!containsUnknownSize(step->thenBranchInputOperands));
927             CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
928             CHECK(!containsUnknownSize(step->elseBranchInputOperands));
929             CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
930         } else if (WhileStep* step = logicalStep->tryWhileStep()) {
931             // The partitioner does not support dynamic temporaries (b/132458982).
932             CHECK(!containsUnknownSize(step->outerInputOperands));
933             CHECK(!containsUnknownSize(step->outerOutputOperands));
934             CHECK(!containsUnknownSize(step->condInputOperands));
935             // step->condOutputOperand has a static shape. See b/158557728.
936             CHECK(!containsUnknownSize(step->bodyInputOperands));
937             CHECK(!containsUnknownSize(step->bodyOutputOperands));
938         } else {
939             CHECK(logicalStep->isGoto());
940         }
941     }
942 
943     if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
944         VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
945                           << simulateFailureResultCode;
946         return simulateFailureResultCode;
947     }
948 
949     for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
950         SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
951         mSourceOperandToInputIndex[index] = i;
952     }
953     for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
954         SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
955         mSourceOperandToOutputIndex[index] = i;
956     }
957 
958     findControlFlowBoundaryConstants(sourceModels);
959     findModelOutputsThatAreDownstreamInputs();
960     findMemoryStepRoles();
961 
962     mSuccessfulFinish = true;
963     return ANEURALNETWORKS_NO_ERROR;
964 }
965 
// Collects constant operands that cross a control flow boundary — the
// condition operand and outer inputs of every IF and WHILE step — and records
// how to materialize each one at execution time: small copies
// (CONSTANT_COPY / POINTER lifetimes) go into mSourceOperandToBoundaryConstantCopy,
// memory-backed constants (CONSTANT_REFERENCE) into
// mSourceOperandToBoundaryConstantReference. Non-constant operands are
// silently skipped.
void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) {
            // Value stored inline in the model's operand value buffer.
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::POINTER) {
            // Value referenced through a caller-provided pointer.
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) {
            // Value stored in one of the model's memory pools.
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}
1004 
// Computes, for every source operand, the set of (step, IOType, io index)
// roles in which a memory backing that operand may be used. ExecutionSteps
// contribute their step model inputs/outputs directly; interpreted IF and
// WHILE steps contribute "used-by" relationships so that an outer memory
// inherits the roles of the inner subgraph operands it may directly back.
// The result is stored in mSourceOperandToStepRoles.
void ExecutionPlan::CompoundBody::findMemoryStepRoles() {
    mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) {
        for (const auto& logicalStep : mSteps) {
            if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
                // Direct roles: each step model input/output slot.
                const auto& stepModelInputs = step->getStepModelInputs();
                for (uint32_t i = 0; i < stepModelInputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelInputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::INPUT, i);
                }
                const auto& stepModelOutputs = step->getStepModelOutputs();
                for (uint32_t i = 0; i < stepModelOutputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelOutputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i);
                }
            } else if (const IfStep* step = logicalStep->tryIfStep()) {
                // See ExecutionPlan::nextCompound(const IfStep*, ...).
                //
                // For interpreted IF operation, the outer input memories may be directly used by
                // the SUBGRAPH_INPUTs of the then and else model.
                CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size());
                CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->thenBranchInputOperands[i]);
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->elseBranchInputOperands[i]);
                }
                // For interpreted IF operation, the outer output memories may be directly used by
                // the SUBGRAPH_OUTPUTs of the then and else model.
                CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size());
                CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size());
                for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->thenBranchOutputOperands[i]);
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->elseBranchOutputOperands[i]);
                }
            } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
                // See ExecutionPlan::nextCompound(const WhileStep*, ...).
                //
                // For interpreted WHILE operation, the following memories are involved:
                // a. the outer input memories to the WHILE operation
                // b. the outer output memories to the WHILE operation
                // c. the output memory of the condition model
                // d. one set of output memories of the body model
                // e. another set of output memories of the body model
                //
                // The memories are used in the following ways:
                //
                // - Condition model:
                //   * In the first iteration: inputs use (a); output uses (c)
                //   * In the following iterations: inputs use (d) or (e) for input-output and
                //     state-only operands, and (a) for input-only operands; output uses (c)
                //
                // - Body model:
                //   * In all iterations: inputs are the same as the condition model; outputs use
                //                        (d) or (e)
                //
                // Therefore, we configure the analyzer with the following used-by relationships:
                // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of
                //   the condition model for all inputs in the first iteration, as well as the
                //   input-only operands in the following iterations.
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only operands
                //   after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}
1092 
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,int simulateFailureResultCode)1093 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
1094                                       int32_t priority, const OptionalTimePoint& deadline,
1095                                       int simulateFailureResultCode) {
1096     CHECK(!mSuccessfulFinish);
1097     CHECK(mDevice != nullptr);
1098     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
1099     int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
1100                     &mToken, &mPreparedModel);
1101     if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
1102         VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
1103                           << simulateFailureResultCode;
1104         n = simulateFailureResultCode;
1105     }
1106     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
1107     return n;
1108 }
1109 
finish(int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,int simulateFailureResultCode)1110 int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
1111                           const OptionalTimePoint& deadline, int simulateFailureResultCode) {
1112     CHECK(mBody != nullptr);
1113     return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
1114                          simulateFailureResultCode);
1115 }
1116 
// Builds the per-execution controller state: operand location maps, the shared
// ashmem region backing all static temporaries, and the dynamic temporaries
// tracker. Boundary constant copies are materialized into the temporary
// memory up front. If temporary allocation fails, mNextStepIndex is set to
// kBadStepIndex so the execution aborts at the first step.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        // Nothing to allocate, and no temporary-backed constants to copy.
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // mTemporaries is null here; returning avoids dereferencing it in the
        // constant-copy loop below.
        return;
    }
    // Materialize each boundary constant copy into its reserved slot of the
    // temporary memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() +
                       mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset,
               location.buffer, location.length);
    }
}
1154 
1155 // Attempt to create a burst object for each PreparedModel/Partition. If the
1156 // burst controller object cannot be made, return a nullptr in its place to
1157 // indicate the regular execution path should be used. This can occur either
1158 // because PreparedModel was nullptr (cpu was best choice), or because the
1159 // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts() const1160 std::vector<SharedBurst> ExecutionPlan::makeBursts() const {
1161     switch (mState) {
1162         // burst object for each partition in the compound case
1163         case COMPOUND: {
1164             std::vector<SharedBurst> bursts;
1165             bursts.reserve(compound()->mSteps.size());
1166             for (const auto& logicalStep : compound()->mSteps) {
1167                 if (!logicalStep->isExecution()) {
1168                     bursts.push_back(nullptr);
1169                     continue;
1170                 }
1171                 if (const auto preparedModel =
1172                             logicalStep->executionStep()->getPreparedStepModel()) {
1173                     const auto maybeBurst = preparedModel->configureExecutionBurst();
1174                     if (!maybeBurst.has_value()) {
1175                         LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1176                                    << maybeBurst.error().code << ": " << maybeBurst.error().message;
1177                     }
1178                     bursts.push_back(maybeBurst.value_or(nullptr));
1179                 } else {
1180                     bursts.push_back(nullptr);
1181                 }
1182             }
1183             return bursts;
1184         }
1185         // single burst object for the simple case
1186         case SIMPLE: {
1187             std::vector<SharedBurst> burst;
1188             auto simpleBody = simple();
1189             if (const auto preparedModel = simpleBody->mPreparedModel) {
1190                 const auto maybeBurst = preparedModel->configureExecutionBurst();
1191                 if (!maybeBurst.has_value()) {
1192                     LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1193                                << maybeBurst.error().code << ": " << maybeBurst.error().message;
1194                 }
1195                 burst.push_back(maybeBurst.value_or(nullptr));
1196             } else {
1197                 burst.push_back(nullptr);
1198             }
1199             return burst;
1200         }
1201         // no burst objects made
1202         default:
1203             return {};
1204     }
1205 }
1206 
// Builds the per-execution Controller for a COMPOUND plan: computes a layout
// for a single shared memory region holding all partition-boundary static
// temporaries (including boundary CONSTANT_COPY data), declares the dynamic
// temporaries, and seeds the Controller's operand-location maps from the
// compound body.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    CHECK(mState != SIMPLE);
    const auto* body = compound();
    // Create the layout for a RuntimeMemory object big enough to hold
    // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
    //    skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
    //    skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
    //    operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries](
                                const SourceOperandIndex& sourceOperandIndex,
                                std::map<SourceOperandIndex, StaticTemporaryLocation>*
                                        sourceOperandToLocationOfTemporary,
                                Operand::LifeTime lifetime =
                                        Operand::LifeTime::TEMPORARY_VARIABLE) {
        CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
              lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT);
        const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex);
        if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE &&
            sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
            // See the caller for explanation.
            return;
        }
        CHECK_EQ(sourceOperand.lifetime, lifetime);
        const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
        if (size != 0u) {
            // Known size: reserve an aligned, padded slot in the shared region.
            const auto memoryPreference =
                    body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
            const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
                                          memoryPreference.padding);
            auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
            CHECK(isNew);
            VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                            << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
        } else {
            // Unknown size, hence dynamic temporary.  The mapping will
            // be established elsewhere (DynamicTemporaries::allocate()).
            CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
            CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
        }
    };
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
    for (const auto& logicalStep : body->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToLocationOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
                         Operand::LifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
                                      memoryPreference.alignment, memoryPreference.padding);
        sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
    }
    // Collect dynamic temporaries.
    // TODO(b/157236079): Move some or all of this work to compilation time?
    DynamicTemporaries dynamicTemporaries;
    const TypeManager* typeManager = TypeManager::get();
    forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
                                    SourceOperandIndex sourceOperandIndex,
                                    const Operand& sourceOperand, uint32_t definingStepIndex) {
        CHECK(typeManager->isTensorType(sourceOperand.type));
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        // TODO: For now we guess an initial size equal to element
        // size, which is overly conservative.
        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
                                   size, memoryPreference.alignment, memoryPreference.padding);
    });
    dynamicTemporaries.endDeclarations();
    dynamicTemporaries.vlogDump("finished declarations");

    // The boundary constant-copy map is passed by const reference: the
    // Controller constructor materializes its contents into the shared
    // temporaries region rather than retaining the map itself.
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToLocationOfTemporary),
            std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
            body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
            body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}
1388 
1389 // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1390 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
1391                             std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1392                             const std::vector<OutputShape>* mainModelOutputShapes) const {
1393     *executor = nullptr;
1394     if (burstController != nullptr) {
1395         *burstController = nullptr;
1396     }
1397 
1398     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
1399                     << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
1400 
1401     if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
1402         // We haven't called next().
1403         return ANEURALNETWORKS_OP_FAILED;
1404     }
1405 
1406     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1407         // The last call to next() did not produce an executor.
1408         return ANEURALNETWORKS_OP_FAILED;
1409     }
1410 
1411     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1412     return next(controller, executor, burstController, mainModelOutputShapes);
1413 }
1414 
// Wraps an existing raw buffer of `size` bytes as a Buffer viewing it from
// offset 0.
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}
1418 
// Wraps a RunTimePoolInfo as a Buffer viewing it starting `offset` bytes in.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1421 
getPointer() const1422 void* ExecutionPlan::Buffer::getPointer() const {
1423     return mInfo.getBuffer() + mOffset;
1424 }
1425 
getSize() const1426 uint32_t ExecutionPlan::Buffer::getSize() const {
1427     return mInfo.getSize() - mOffset;
1428 }
1429 
// Flushes the Buffer's backing memory pool (delegates to
// RunTimePoolInfo::flush).
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1433 
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1434 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1435         const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1436     switch (info.state()) {
1437         case ModelArgumentInfo::POINTER: {
1438             return Buffer(info.buffer(), info.length());
1439         } break;
1440         case ModelArgumentInfo::MEMORY: {
1441             if (std::optional<RunTimePoolInfo> poolInfo =
1442                         executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1443                 return Buffer(*poolInfo, info.locationAndLength().offset);
1444             } else {
1445                 LOG(ERROR) << "Unable to map operand memory pool";
1446                 return std::nullopt;
1447             }
1448         } break;
1449         case ModelArgumentInfo::HAS_NO_VALUE: {
1450             LOG(ERROR) << "Attempting to read an operand that has no value";
1451             return std::nullopt;
1452         } break;
1453         default: {
1454             LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1455             return std::nullopt;
1456         } break;
1457     }
1458 }
1459 
getBuffer(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex) const1460 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
1461         std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
1462     const auto& sourceOperandToLocationOfTemporary =
1463             controller->mSourceOperandToLocationOfTemporary;
1464     const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
1465     const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
1466     const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
1467     if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
1468         it != sourceOperandToLocationOfTemporary.end()) {
1469         const uint32_t offset = it->second.offset;
1470         const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
1471         return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
1472     } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
1473                it != sourceOperandToInputIndex.end()) {
1474         const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
1475         return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1476     } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
1477                it != sourceOperandToOutputIndex.end()) {
1478         const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
1479         return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1480     } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
1481                it != sourceOperandToConstantReference.end()) {
1482         const ConstantReferenceLocation& location = it->second;
1483         const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
1484         if (info == std::nullopt) {
1485             return std::nullopt;
1486         }
1487         return Buffer(info->getBuffer() + location.offset, location.length);
1488     }
1489     return std::nullopt;
1490 }
1491 
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1492 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1493                                       SourceOperandIndex operandIndex, bool* value) const {
1494     std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1495     if (buffer == std::nullopt) {
1496         LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1497         return ANEURALNETWORKS_OP_FAILED;
1498     }
1499     CHECK_GE(buffer->getSize(), sizeof(bool8));
1500     bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1501     *value = static_cast<bool>(value8);
1502     VLOG(EXECUTION) << "readConditionValue: " << *value;
1503     return ANEURALNETWORKS_NO_ERROR;
1504 }
1505 
next(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes,int syncFdOfLastStep) const1506 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
1507                         std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1508                         const std::vector<OutputShape>* mainModelOutputShapes,
1509                         int syncFdOfLastStep) const {
1510     CHECK(mState == COMPOUND);
1511 
1512     controller->mLastStepSyncFd = syncFdOfLastStep;
1513     *executor = nullptr;
1514     if (burstController != nullptr) {
1515         *burstController = nullptr;
1516     }
1517 
1518     VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
1519                     << "): mNextStepIndex = " << controller->mNextStepIndex;
1520 
1521     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1522         return ANEURALNETWORKS_OP_FAILED;
1523     }
1524 
1525     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1526 }
1527 
nextCompound(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1528 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1529                                 std::shared_ptr<StepExecutor>* executor,
1530                                 SharedBurst* burstController,
1531                                 const std::vector<OutputShape>* mainModelOutputShapes) const {
1532     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1533         return ANEURALNETWORKS_OP_FAILED;
1534     }
1535 
1536     auto compoundBody = compound();
1537     if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1538         controller->mNextStepIndex = Controller::kBadStepIndex;  // end
1539         return ANEURALNETWORKS_NO_ERROR;
1540     }
1541 
1542     const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1543     if (const IfStep* step = logicalStep->tryIfStep()) {
1544         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1545     } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1546         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1547     } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1548         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1549     } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1550         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1551     } else {
1552         CHECK(false) << "Unknown step variant";
1553         return ANEURALNETWORKS_BAD_STATE;
1554     }
1555 }
1556 
// Prepares execution of a single partition (ExecutionStep): allocates the
// dynamic temporaries this step defines, builds a StepExecutor with the
// step's inputs/outputs mapped to their locations, optionally hands back the
// step's burst controller, and advances the controller's step index
// (recording the current index as the fallback index).
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    // Dynamic temporaries must be allocated before mapInputsAndOutputs below
    // reads their locations.
    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    // Bind every step-model input/output to its concrete location (static
    // temporary, dynamic temporary, main-model argument, or constant).
    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    // Remember this step so ExecutionPlan::fallback() can rewind to it.
    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
1585 
// The first argument is the "source" operand, the second operand is the "destination".
// Redirects the inner operand (e.g. a branch-model input of an IF) to resolve
// to the same location as the outer operand, by copying the outer operand's
// entry from whichever of the four location maps contains it.
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // Sanity check: an operand should appear in at most one location map.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    // Drop any previous mapping of the inner operand from all four maps
    // before installing the new one.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    // Copy the outer operand's entry into the same map for the inner operand.
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1619 
// The first argument is the "source" operand, the second operand is the "destination".
// Output counterpart of setInput: redirects the inner operand to the outer
// operand's location. Only the temporary-location and output-index maps are
// consulted, since an output cannot come from an input or a constant.
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // Sanity check: an operand should appear in at most one location map.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    // Drop any previous mapping of the inner operand before installing the
    // new one.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    // Copy the outer operand's entry into the same map for the inner operand.
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1643 
waitForLastStepSyncFence() const1644 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1645     if (mLastStepSyncFd == -1) {
1646         return ANEURALNETWORKS_NO_ERROR;
1647     }
1648     VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1649     auto r = syncWait(mLastStepSyncFd, -1);
1650     int n = ANEURALNETWORKS_NO_ERROR;
1651     if (r != FenceState::SIGNALED) {
1652         LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1653         n = ANEURALNETWORKS_OP_FAILED;
1654     }
1655     return n;
1656 }
1657 
1658 // Invocations of Controller::setInput/setOutput in this function must match with invocations of
1659 // StepRoleAnalyzer::setUsedBy in the IfStep branch in
1660 // ExecutionPlan::CompoundBody::findMemoryStepRoles.
nextCompound(const IfStep * step,std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1661 int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
1662                                 std::shared_ptr<StepExecutor>* executor,
1663                                 SharedBurst* burstController,
1664                                 const std::vector<OutputShape>* mainModelOutputShapes) const {
1665     VLOG(EXECUTION) << "next: " << *step;
1666     // If the last step has a sync fence, wait for it to signal before reading the condition value.
1667     // This is safe because the steps are serialized when doing fenced compute.
1668     NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
1669     bool condValue;
1670     NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
1671     controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
1672     const std::vector<SourceOperandIndex>& branchInputOperands =
1673             condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
1674     const std::vector<SourceOperandIndex>& branchOutputOperands =
1675             condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
1676     CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
1677     CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
1678     for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
1679         // We have to do this assignment just before executing this step to
1680         // accommodate cases when the IF resides within a WHILE condition or
1681         // body model and for some j the i-th input of the IF branch model is
1682         // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
1683         // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
1684         // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
1685         // In such cases, the WhileStep modifies the location of
1686         // step->outerInputOperands[i] to implement double buffering.
1687         controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
1688     }
1689     for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
1690         // We have to do this assignment just before executing this step to
1691         // accommodate the case when the IF resides within a WHILE body
1692         // model and the i-th output of the IF branch model is an
1693         // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
1694         // some j). In that case, the WhileStep modifies the location of
1695         // step->outerOutputOperands[i] to implement double buffering.
1696         controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
1697     }
1698     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1699 }
1700 
// Invocations of Controller::setInput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the WhileStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Interprets one WHILE construct transition. A WhileStep is a two-stage state machine per
// Controller (keyed by step index): EVALUATE_CONDITION runs the condition model, then
// EVALUATE_BODY either runs the body model (condition true) or exits the loop (condition
// false), copying the final body outputs to the outer outputs.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        // kOutsideLoop marks a loop not yet (re-)entered; the first pass is iteration 0.
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        // Start the loop-timeout clock on loop entry.
        if (state.iteration == 0) {
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Condition inputs beyond the body-output count are never produced by the body, so
            // they always come from the outer inputs regardless of iteration.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Enforce the user-configured loop timeout; give up if the loop has run too long.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        // The condition inputs already point at the correct buffers for this iteration (see the
        // EVALUATE_CONDITION stage above), so the body inputs alias them.
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        // Double buffering: swap the two temporary locations of each body output so this
        // iteration writes into the buffer not read as this iteration's input.
        if (state.iteration != 0) {
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
                          controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        // Reset iteration count so a later re-entry of this loop starts fresh.
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1825 
nextCompound(const GotoStep * step,std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1826 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
1827                                 std::shared_ptr<StepExecutor>* executor,
1828                                 SharedBurst* burstController,
1829                                 const std::vector<OutputShape>* mainModelOutputShapes) const {
1830     VLOG(EXECUTION) << "next: " << *step;
1831     controller->mNextStepIndex = step->gotoStepIndex;
1832     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1833 }
1834 
makeStepExecutor(bool reusable,ExecutionBuilder * executionBuilder) const1835 std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
1836         bool reusable, ExecutionBuilder* executionBuilder) const {
1837     auto simpleBody = simple();
1838     auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
1839                                                    simpleBody->mDevice, simpleBody->mPreparedModel,
1840                                                    reusable);
1841     executor->mapInputsAndOutputsTrivially();
1842     return executor;
1843 }
1844 
becomeCompoundIfEmpty()1845 void ExecutionPlan::becomeCompoundIfEmpty() {
1846     CHECK(mState != SIMPLE);
1847     if (mState == EMPTY) {
1848         mBody = new CompoundBody(this);
1849         mState = COMPOUND;
1850     }
1851 }
1852 
createNewExecutionStep(uint32_t sourceModelIndex,const std::shared_ptr<Device> device)1853 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1854                                                      const std::shared_ptr<Device> device) {
1855     becomeCompoundIfEmpty();
1856     auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1857                                               compound()->mSteps.size(), sourceModelIndex, device);
1858     compound()->mSteps.push_back(step);
1859     return step->executionStep();
1860 }
1861 
createNewIfStep()1862 IfStep* ExecutionPlan::createNewIfStep() {
1863     becomeCompoundIfEmpty();
1864     auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1865     step->ifStep()->index = compound()->mSteps.size();
1866     compound()->mSteps.push_back(step);
1867     return step->ifStep();
1868 }
1869 
createNewWhileStep()1870 WhileStep* ExecutionPlan::createNewWhileStep() {
1871     becomeCompoundIfEmpty();
1872     auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1873     step->whileStep()->index = compound()->mSteps.size();
1874     compound()->mSteps.push_back(step);
1875     return step->whileStep();
1876 }
1877 
createNewGotoStep()1878 GotoStep* ExecutionPlan::createNewGotoStep() {
1879     becomeCompoundIfEmpty();
1880     auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1881     step->gotoStep()->index = compound()->mSteps.size();
1882     compound()->mSteps.push_back(step);
1883     return step->gotoStep();
1884 }
1885 
becomeSingleStep(const std::shared_ptr<Device> device,const ModelBuilder * model)1886 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
1887                                      const ModelBuilder* model) {
1888     CHECK(mState == EMPTY);
1889     mBody = new SimpleBody(device, model, mCacheInfo, mToken);
1890     mState = SIMPLE;
1891 }
1892 
recordOutputDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1893 void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1894     auto [it, isNew] =
1895             compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1896     CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
1897                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1898 }
1899 
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1900 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1901     auto [it, isNew] =
1902             compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1903     CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1904                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1905 }
1906 
dump() const1907 void ExecutionPlan::dump() const {
1908     if (mBody) {
1909         mBody->dump();
1910     } else {
1911         VLOG(COMPILATION) << "EMPTY";
1912     }
1913 }
1914 
reset()1915 void ExecutionPlan::reset() {
1916     if (mBody) {
1917         delete mBody;
1918         mBody = nullptr;
1919     }
1920     mState = EMPTY;
1921 }
1922 
isSimpleCpu() const1923 bool ExecutionPlan::isSimpleCpu() const {
1924     return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1925 }
1926 
forTest_getKind() const1927 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1928     switch (mState) {
1929         case EMPTY:
1930             return Kind::EMPTY;
1931         case SIMPLE:
1932             nnAssert(mBody);
1933             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1934         case COMPOUND:
1935             nnAssert(mBody);
1936             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1937         default:
1938             nnAssert(!"unexpected state");
1939             return Kind::ERROR;
1940     }
1941 }
1942 
forTest_simpleGetDevice() const1943 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1944     return simple()->mDevice;
1945 }
1946 
forTest_compoundGetSteps() const1947 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
1948     return compound()->mSteps;
1949 }
1950 
forTest_flatGetDynamicTemporaries() const1951 std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
1952     CHECK_EQ(getSourceModels().size(), size_t(1));
1953     std::set<uint32_t> ret;
1954     forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
1955         ret.insert(dynTemp.second);
1956     });
1957     return ret;
1958 }
1959 
hasDynamicTemporaries() const1960 bool ExecutionPlan::hasDynamicTemporaries() const {
1961     return mBody->hasDynamicTemporaries();
1962 }
1963 
forTest_hasStepModelWithNoInputsOrNoOutputs() const1964 bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
1965     return mBody->hasStepModelWithNoInputsOrNoOutputs();
1966 }
1967 
hasStepModelWithNoInputsOrNoOutputs() const1968 bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
1969     return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
1970         const ExecutionStep* step = logicalStep->tryExecutionStep();
1971         return step != nullptr && step->hasNoInputsOrNoOutputs();
1972     });
1973 }
1974 
forTest_simpleGetCacheToken() const1975 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
1976     return simple()->mToken.getCacheToken();
1977 }
1978 
dump() const1979 void ExecutionPlan::SimpleBody::dump() const {
1980     VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
1981 }
1982 
dump() const1983 void ExecutionPlan::CompoundBody::dump() const {
1984     for (const auto& step : mSteps) {
1985         step->dump();
1986     }
1987 }
1988 
getInputSourceOperand(uint32_t index) const1989 SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
1990     const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
1991     CHECK_LT(index, mainModel->inputCount());
1992     const auto operandIndex = mainModel->getInputOperandIndex(index);
1993     return {kMainModelInSourceModels, operandIndex};
1994 }
1995 
getOutputSourceOperand(uint32_t index) const1996 SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
1997     const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
1998     CHECK_LT(index, mainModel->outputCount());
1999     const auto operandIndex = mainModel->getOutputOperandIndex(index);
2000     return {kMainModelInSourceModels, operandIndex};
2001 }
2002 
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2003 void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
2004                                                        const StepRoleCallback& callback) const {
2005     callback(mPreparedModel.get(), IOType::INPUT, index);
2006 }
2007 
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2008 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
2009                                                         const StepRoleCallback& callback) const {
2010     callback(mPreparedModel.get(), IOType::OUTPUT, index);
2011 }
2012 
2013 // Map an input role of the main model to the input/output roles in the step models.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2014 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
2015                                                          const StepRoleCallback& callback) const {
2016     const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
2017     forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2018 }
2019 
2020 // Map an output role of the main model to the input/output roles in the step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2021 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
2022                                                           const StepRoleCallback& callback) const {
2023     const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index);
2024     forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2025 }
2026 
forEachStepRoleOfSourceOperand(const SourceOperandIndex & index,const StepRoleCallback & callback) const2027 void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand(
2028         const SourceOperandIndex& index, const StepRoleCallback& callback) const {
2029     const auto it = mSourceOperandToStepRoles.find(index);
2030     if (it == mSourceOperandToStepRoles.end()) return;
2031     for (const auto& [stepIndex, type, ioIndex] : it->second) {
2032         CHECK_LT(stepIndex, mSteps.size());
2033         const auto* step = mSteps[stepIndex]->executionStep();
2034         callback(step->getPreparedStepModel().get(), type, ioIndex);
2035     }
2036 }
2037 
getMemoryPreference(IOType type,uint32_t index) const2038 MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const {
2039     CHECK(mState == SIMPLE || mState == COMPOUND);
2040     if (mState == SIMPLE) {
2041         return simple()->mPreparedModel->getMemoryPreference();
2042     } else {
2043         const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index)
2044                                                               : getOutputSourceOperand(index);
2045         return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
2046     }
2047 }
2048 
getMemoryPreferenceOfSourceOperand(const SourceOperandIndex & index) const2049 MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand(
2050         const SourceOperandIndex& index) const {
2051     uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding;
2052     forEachStepRoleOfSourceOperand(
2053             index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) {
2054                 const auto preference = preparedModel->getMemoryPreference();
2055                 alignment = std::max(alignment, preference.alignment);
2056                 padding = std::max(padding, preference.padding);
2057             });
2058     return {alignment, padding};
2059 }
2060 
forEachDynamicTemporary(const std::function<void (SourceOperandIndex,const Operand &,uint32_t definingStepIndex)> & fn) const2061 void ExecutionPlan::forEachDynamicTemporary(
2062         const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
2063                 fn) const {
2064     if (mState != COMPOUND) {
2065         return;
2066     }
2067 
2068     for (const auto& logicalStep : compound()->mSteps) {
2069         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
2070             const uint32_t stepIndex = step->getIndex();
2071             const uint32_t sourceModelIndex = step->getSourceModelIndex();
2072             for (const auto& entry : step->getTempsAsStepModelOutputs()) {
2073                 const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
2074                 const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
2075                 if (hasUnknownSize(sourceOperand)) {
2076                     fn(sourceOperandIndex, sourceOperand, stepIndex);
2077                 }
2078             }
2079         }
2080     }
2081 }
2082 
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan,int simulateFailureResultCode) const2083 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
2084                                    uint32_t preference, uint32_t priority,
2085                                    const OptionalTimePoint& deadline, ExecutionPlan* plan,
2086                                    int simulateFailureResultCode) const {
2087     uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
2088     NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
2089                                                 deadline, plan));
2090     int n = plan->finish(preference, priority, deadline, simulateFailureResultCode);
2091     if (VLOG_IS_ON(COMPILATION)) {
2092         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
2093         logModelToInfo(makeModel());
2094         plan->dump();
2095     }
2096     return n;
2097 }
2098 
partitionTheWorkInternal(uint32_t sourceModelIndex,const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan) const2099 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
2100                                            const std::vector<std::shared_ptr<Device>>& devices,
2101                                            uint32_t preference, uint32_t priority,
2102                                            const OptionalTimePoint& deadline,
2103                                            ExecutionPlan* plan) const {
2104     // This function uses a heuristic approach to partitioning the graph.
2105     // It should be good enough for the first release.
2106 
2107     SourceModels* sourceModels = &plan->getSourceModels();
2108     const size_t deviceCount = devices.size();
2109     const size_t operationCount = mOperations.size();
2110 
2111     VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
2112                       << "sourceModelIndex = " << sourceModelIndex << ", "
2113                       << "deviceCount = " << deviceCount << ", "
2114                       << "operationCount = " << operationCount;
2115 
2116     // Figure out where each operation will best execute.
2117     // The value of the vector is the index in the devices vector.
2118     std::vector<int> bestDeviceForOperation(operationCount);
2119     NN_RETURN_IF_ERROR(
2120             findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
2121 
2122     // A special value produced by findBestDeviceForEachOperation meaning that
2123     // this is a control flow operation scheduled for interpreted execution
2124     // (see LogicalStep).
2125     const int kControlFlowInterpreter = deviceCount;
2126 
2127     // If one device will run all the operations, we don't need to split the
2128     // work. This shortcut does not apply when recursively partitioning
2129     // referenced models because our plan representation is flat.
2130     if (sourceModelIndex == kMainModelInSourceModels &&
2131         std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
2132                            std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
2133         const int bestDeviceIndex = bestDeviceForOperation[0];
2134         // Bypass the partitioning process unless the only operation is a
2135         // control flow operation scheduled for interpreted execution.
2136         if (bestDeviceIndex != kControlFlowInterpreter) {
2137             VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
2138                               << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
2139             plan->becomeSingleStep(devices[bestDeviceIndex], this);
2140             return ANEURALNETWORKS_NO_ERROR;
2141         }
2142     }
2143 
2144     // No easy solution, we need to split the work.
2145 
2146     // We keep track of the operations that are ready to run for each device.
2147     // perDeviceQueue[deviceCount] is for interpreted execution of control flow
2148     // (see LogicalStep).
2149     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
2150 
2151     // This helper function produces a device name.
2152     auto deviceName = [&devices, kControlFlowInterpreter,
2153                        deviceCount](int deviceIndex) -> std::string {
2154         if (deviceIndex == kControlFlowInterpreter) {
2155             return "NNAPI";
2156         } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
2157             return "{unknown}";
2158         } else {
2159             return devices.at(deviceIndex)->getName();
2160         }
2161     };
2162 
2163     // This helper function enqueues the operation on the appropriate queue.
2164     auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
2165         int deviceIndex = bestDeviceForOperation[operationIndex];
2166         perDeviceQueue[deviceIndex].push(operationIndex);
2167         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
2168                           << deviceIndex << " (" << deviceName(deviceIndex) << ")";
2169     };
2170 
2171     // This helper function finds a device that has operations ready to process.
2172     // We start by looking at the control flow queue, and then look at the
2173     // devices in reverse order (i.e., starting at the end of the devices
2174     // vector). Earlier devices have a chance to prepare more of the inputs
2175     // required by other devices. This function returns -1 if all queues are
2176     // empty.
2177     auto findNextDeviceToProcess = [&]() -> int {
2178         for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
2179             if (!perDeviceQueue[i].empty()) {
2180                 return i;
2181             }
2182         }
2183         return -1;
2184     };
2185 
2186     OperandTracker tracker(this, enqueueOnAppropriateDevice);
2187     // For each iteration of this loop, we'll create either an execution step or
2188     // an interpreted control flow construct (including nested execution steps
2189     // and interpreted control flow constructs).
2190     while (true) {
2191         // Find the device we'll do this step for.
2192         int deviceIndex = findNextDeviceToProcess();
2193         VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
2194                           << deviceName(deviceIndex) << ")";
2195         if (deviceIndex < 0) {
2196             break;
2197         }
2198 
2199         // Assign as much as possible to this device.
2200         auto& queue = perDeviceQueue[deviceIndex];
2201         if (deviceIndex != kControlFlowInterpreter) {
2202             ExecutionStep* step =
2203                     plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
2204             while (!queue.empty()) {
2205                 uint32_t operationIndex = queue.front();
2206                 queue.pop();
2207                 int n = step->addOperation(operationIndex);
2208                 if (n != ANEURALNETWORKS_NO_ERROR) {
2209                     LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
2210                     return n;
2211                 }
2212                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
2213             }
2214         } else {
2215             while (!queue.empty()) {
2216                 uint32_t operationIndex = queue.front();
2217                 queue.pop();
2218                 const Operation& operation = getOperation(operationIndex);
2219                 if (operation.type == OperationType::IF) {
2220                     namespace op = operation_if;
2221                     const Operand& thenOperand =
2222                             getOperand(operation.inputs[op::kThenModelOperand]);
2223                     const Operand& elseOperand =
2224                             getOperand(operation.inputs[op::kElseModelOperand]);
2225                     const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2226                     const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2227                     uint32_t thenModelIndex = sourceModels->addModel(thenModel);
2228                     uint32_t elseModelIndex = sourceModels->addModel(elseModel);
2229 
2230                     // Emits the following:
2231                     // Index  Step
2232                     //   i    if then=(i + 1) else=(j + 1)
2233                     //  ...   (then model steps)
2234                     //   j    goto k
2235                     //  ...   (else model steps)
2236                     //   k    (steps after the IF)
2237                     IfStep* ifStep = plan->createNewIfStep();
2238                     ifStep->conditionOperandIndex = SourceOperandIndex(
2239                             sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
2240                     ifStep->thenStepIndex = plan->getNextStepIndex();
2241                     NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
2242                             thenModelIndex, devices, preference, priority, deadline, plan));
2243                     GotoStep* afterThenBranch = plan->createNewGotoStep();
2244                     ifStep->elseStepIndex = plan->getNextStepIndex();
2245                     NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
2246                             elseModelIndex, devices, preference, priority, deadline, plan));
2247                     afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
2248 
2249                     // Outer model operands.
2250                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
2251                         ifStep->outerInputOperands.emplace_back(sourceModelIndex,
2252                                                                 operation.inputs[i]);
2253                     }
2254                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
2255                         ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
2256                                                                  operation.outputs[i]);
2257                     }
2258                     // Then model operands.
2259                     for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
2260                         ifStep->thenBranchInputOperands.emplace_back(
2261                                 thenModelIndex, thenModel->getInputOperandIndex(i));
2262                     }
2263                     for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
2264                         ifStep->thenBranchOutputOperands.emplace_back(
2265                                 thenModelIndex, thenModel->getOutputOperandIndex(i));
2266                     }
2267                     // Else model operands.
2268                     for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
2269                         ifStep->elseBranchInputOperands.emplace_back(
2270                                 elseModelIndex, elseModel->getInputOperandIndex(i));
2271                     }
2272                     for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
2273                         ifStep->elseBranchOutputOperands.emplace_back(
2274                                 elseModelIndex, elseModel->getOutputOperandIndex(i));
2275                     }
2276                 } else if (operation.type == OperationType::WHILE) {
2277                     namespace op = operation_while;
2278                     const Operand& condOperand =
2279                             getOperand(operation.inputs[op::kCondModelOperand]);
2280                     const Operand& bodyOperand =
2281                             getOperand(operation.inputs[op::kBodyModelOperand]);
2282                     const ModelBuilder* condModel = getReferencedModel(condOperand);
2283                     const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2284                     uint32_t condModelIndex = sourceModels->addModel(condModel);
2285                     uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
2286 
2287                     // Emits the following:
2288                     // Index  Step
2289                     //   i    while cond=(i + 1) body=(j + 1) exit=(k + 1)
2290                     //  ...   (cond model steps)
2291                     //   j    goto i
2292                     //  ...   (body model steps)
2293                     //   k    goto i
2294                     //  ...   (steps after the WHILE)
2295                     //
2296                     //  Note that WhileStep has WhileState associated with it.
2297                     WhileStep* whileStep = plan->createNewWhileStep();
2298                     whileStep->condStepIndex = plan->getNextStepIndex();
2299                     NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
2300                             condModelIndex, devices, preference, priority, deadline, plan));
2301                     GotoStep* afterCond = plan->createNewGotoStep();
2302                     afterCond->gotoStepIndex = whileStep->index;
2303                     whileStep->bodyStepIndex = plan->getNextStepIndex();
2304                     NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
2305                             bodyModelIndex, devices, preference, priority, deadline, plan));
2306                     GotoStep* afterBody = plan->createNewGotoStep();
2307                     afterBody->gotoStepIndex = whileStep->index;
2308                     whileStep->exitStepIndex = plan->getNextStepIndex();
2309 
2310                     // Outer model operands.
2311                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
2312                         whileStep->outerInputOperands.emplace_back(sourceModelIndex,
2313                                                                    operation.inputs[i]);
2314                     }
2315                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
2316                         whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
2317                                                                     operation.outputs[i]);
2318                     }
2319                     // Cond model operands.
2320                     for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
2321                         whileStep->condInputOperands.emplace_back(
2322                                 condModelIndex, condModel->getInputOperandIndex(i));
2323                     }
2324                     whileStep->condOutputOperand =
2325                             SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
2326                     // Body model operands.
2327                     for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
2328                         whileStep->bodyInputOperands.emplace_back(
2329                                 bodyModelIndex, bodyModel->getInputOperandIndex(i));
2330                     }
2331                     for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
2332                         whileStep->bodyOutputOperands.emplace_back(
2333                                 bodyModelIndex, bodyModel->getOutputOperandIndex(i));
2334                     }
2335                 } else {
2336                     CHECK(false) << operation.type << " is not a control flow operation";
2337                 }
2338                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
2339             }
2340         }
2341     }
2342     return ANEURALNETWORKS_NO_ERROR;
2343 }
2344 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const2345 float ModelBuilder::getPerformance(uint32_t preference,
2346                                    const std::shared_ptr<Device> device) const {
2347     // Note that we will call this method multiple times per compilation with
2348     // the same arguments if there are nested control flow operations and we
2349     // decide to execute the outer operation on the ExecutionPlan::next()
2350     // interpreter.
2351     //
2352     // This is a potential compilation performance problem. To work around it,
2353     // the performance value could be cached for the duration of a compilation.
2354     float perf = 0;
2355     const size_t operationCount = mOperations.size();
2356     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2357         perf += getPerformance(preference, device, operationIndex);
2358     }
2359     return perf;
2360 }
2361 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const2362 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
2363                                    uint32_t operationIndex) const {
2364     auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) {
2365         return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
2366     };
2367 
2368     const Operation& operation = getOperation(operationIndex);
2369 
2370     if (operation.type == OperationType::IF) {
2371         namespace op = operation_if;
2372         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2373         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2374         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2375         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2376         return applyPreference(device->getIfPerformance()) +
2377                0.5 * (thenModel->getPerformance(preference, device) +
2378                       elseModel->getPerformance(preference, device));
2379     }
2380 
2381     if (operation.type == OperationType::WHILE) {
2382         namespace op = operation_while;
2383         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2384         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2385         const ModelBuilder* condModel = getReferencedModel(condOperand);
2386         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2387         return applyPreference(device->getWhilePerformance()) +
2388                condModel->getPerformance(preference, device) +
2389                bodyModel->getPerformance(preference, device);
2390     }
2391 
2392     // TODO This assumes that the type is dictated by the first operand. This is
2393     // currently the case but is not a safe assumption to make in the long term.
2394     const uint32_t operandIndex = operation.inputs[0];
2395     const OperandType operandType = mOperands[operandIndex].type;
2396     switch (operandType) {
2397         case OperandType::FLOAT32:
2398             if (mRelaxComputationFloat32toFloat16) {
2399                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
2400             }
2401             break;
2402         case OperandType::TENSOR_FLOAT32:
2403             if (mRelaxComputationFloat32toFloat16) {
2404                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
2405             }
2406             break;
2407         default:
2408             break;
2409     }
2410 
2411     return applyPreference(device->getPerformance(operandType));
2412 }
2413 
isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const2414 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
2415     auto containsUnknownSize = [](const ModelBuilder* model,
2416                                   const std::vector<uint32_t>& operandIndexes) {
2417         for (uint32_t operandIndex : operandIndexes) {
2418             if (hasUnknownSize(model->getOperand(operandIndex))) {
2419                 return true;
2420             }
2421         }
2422         return false;
2423     };
2424 
2425     const Operation& operation = getOperation(operationIndex);
2426 
2427     if (operation.type == OperationType::IF) {
2428         namespace op = operation_if;
2429         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2430         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2431         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2432         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2433         return containsUnknownSize(this, operation.inputs) ||
2434                containsUnknownSize(this, operation.outputs) ||
2435                containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
2436                containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
2437                containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
2438                containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
2439     }
2440 
2441     if (operation.type == OperationType::WHILE) {
2442         namespace op = operation_while;
2443         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2444         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2445         const ModelBuilder* condModel = getReferencedModel(condOperand);
2446         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2447         return containsUnknownSize(this, operation.inputs) ||
2448                containsUnknownSize(this, operation.outputs) ||
2449                containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
2450                containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
2451                containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
2452                containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
2453     }
2454 
2455     // Not a control flow operation.
2456     return false;
2457 }
2458 
supportedByControlFlowInterpreter(uint32_t operationIndex) const2459 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
2460     const Operation& operation = getOperation(operationIndex);
2461     return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
2462            // The partitioner does not support dynamic temporaries (b/132458982).
2463            !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
2464 }
2465 
2466 namespace {
2467 
2468 // This class determines whether a given device can execute a given operation
2469 class CanDo {
2470    public:
CanDo()2471     CanDo() {}
2472 
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)2473     void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
2474         mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
2475     }
2476 
check(size_t operationIndex) const2477     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
2478 
2479    private:
2480     std::vector<bool> mSupportsOperationByIndex;
2481 };
2482 
2483 }  // anonymous namespace
2484 
findBestDeviceForEachOperation(uint32_t preference,const std::vector<std::shared_ptr<Device>> & devices,std::vector<int> * bestDeviceForOperation) const2485 int ModelBuilder::findBestDeviceForEachOperation(
2486         uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
2487         std::vector<int>* bestDeviceForOperation) const {
2488     const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());
2489 
2490     const size_t deviceCount = devices.size();
2491     std::vector<CanDo> canDo(deviceCount);
2492     for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
2493         canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
2494     }
2495 
2496     // Figure out the best driver for each operation.
2497     const size_t operationCount = mOperations.size();
2498     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2499         const Operation& operation = getOperation(operationIndex);
2500         // Find which device, including CPU fallback, gives the best performance for this operation.
2501         int bestChoice = -1;
2502 
2503         if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
2504             // Do not schedule control flow operations with unknown size to
2505             // non-CPU devices because this is not supported by the 1.3 HAL.
2506             // See http://b/159076604#comment5.
2507             auto cpuDeviceIterator =
2508                     std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
2509             if (cpuDeviceIterator != devices.end()) {
2510                 int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
2511                 if (canDo[cpuDeviceIndex].check(operationIndex)) {
2512                     bestChoice = cpuDeviceIndex;
2513                 }
2514             }
2515         } else {
2516             float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
2517             bool bestIsUpdatable = false;
2518             for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
2519                 const auto& device = devices[deviceIndex];
2520                 if (canDo[deviceIndex].check(operationIndex)) {
2521                     const float perfVal = getPerformance(preference, device, operationIndex);
2522                     const bool isUpdatable = device->isUpdatable();
2523                     const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice() ||
2524                                                     (isUpdatable && !bestIsUpdatable));
2525                     if (bestChoice < 0 || perfVal < bestPerfVal ||
2526                         (perfVal == bestPerfVal && deviceIsPreferred)) {
2527                         bestChoice = deviceIndex;
2528                         bestPerfVal = perfVal;
2529                         bestIsUpdatable = isUpdatable;
2530                     }
2531                 } else {
2532                     // Somewhat noisy logging, but only place where the user of NNAPI can get
2533                     // feedback on why an operation was not run on a specific device.
2534                     //
2535                     // Logs O(operationCount * deviceCount) times, but typically deviceCount is
2536                     // very small.
2537                     VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
2538                                       << operation.type << ":" << operationIndex;
2539                 }
2540             }
2541         }
2542 
2543         if (bestChoice < 0) {
2544             LOG(ERROR) << "No driver can do operation " << operation.type;
2545             return ANEURALNETWORKS_BAD_DATA;
2546         } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
2547                    supportedByControlFlowInterpreter(operationIndex)) {
2548             // Run control flow on the ExecutionPlan::next() interpreter and try
2549             // to delegate referenced models.
2550             const int kControlFlowInterpreter = deviceCount;
2551             (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
2552             VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
2553                               << ":" << operationIndex << ") = -1 (NNAPI)";
2554         } else {
2555             (*bestDeviceForOperation)[operationIndex] = bestChoice;
2556             VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
2557                               << ":" << operationIndex << ") = " << bestChoice << " ("
2558                               << devices[bestChoice]->getName() << ")";
2559         }
2560     }
2561     return ANEURALNETWORKS_NO_ERROR;
2562 }
2563 
2564 }  // namespace nn
2565 }  // namespace android
2566