/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <GraphDump.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <OperationsUtils.h>
#include <TokenHasher.h>
#include <Tracing.h>
#include <fcntl.h>
#include <nnapi/IBurst.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionCallback.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "TypeManager.h"

namespace android {
namespace nn {

namespace {

// The index of the main model in SourceModels.
constexpr uint32_t kMainModelInSourceModels = 0;

constexpr uint32_t kNoPadding = 1;

// Compiles the model on the given device.
// If compilation caching is available, then depending on ExecutionPlan::mState, the token may have
// been initialized only from the user-provided token (SIMPLE body), or may already have been
// re-hashed with the indices of the operations to be executed (COMPOUND body). In either case,
// this function re-hashes the token further with the device name, the device version string, the
// execution preference, and the compilation priority.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            TokenHasher* token, std::shared_ptr<RuntimePreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
        cacheToken = CacheToken{};
        const uint8_t* tokenPtr = token->getCacheToken();
        std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
    }

    const ModelFactory makeModel = [&model] { return model.makeModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToCanonicalPriority(compilationPriority);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken);
    *preparedModel = returnedPreparedModel;
    return n;
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
        auto& fromChannelQuant =
                std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtension(fromOperand.type) &&
               std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
        auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << fromOperand.type
                   << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away, and calls cb for each of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Now that the
    // outputs of that operation are known, other operations may become ready
    // to run; calls cb for each of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
                lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
                count++;
                mOperandToOperations.emplace(operandIndex, operationIndex);
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}
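
// Illustrative usage sketch (not part of the runtime): a typical topological
// walk driven by OperandTracker. The ready queue and the enqueue callback here
// are hypothetical, but the partitioner uses the same pattern.
//
//     std::queue<uint32_t> ready;
//     auto enqueue = [&ready](uint32_t operationIndex) { ready.push(operationIndex); };
//     OperandTracker tracker(model, enqueue);  // seeds the queue with operations
//                                              // whose inputs are all known
//     while (!ready.empty()) {
//         const uint32_t operationIndex = ready.front();
//         ready.pop();
//         // ... schedule or process the operation ...
//         tracker.markProcessed(operationIndex, enqueue);
//     }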

StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
                                     uint32_t alignment, uint32_t padding) {
    // TODO: what about overflow?
    *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
    const uint32_t offset = *totalSizeOfTemporaries;
    size = roundUp(size, padding);
    *totalSizeOfTemporaries += size;
    return {.offset = offset, .paddedLength = size};
}
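
// Worked example (illustrative): with *totalSizeOfTemporaries == 10, size == 5,
// alignment == 8, and padding == 4, the running total is first rounded up to 16
// (the returned offset), size is padded to 8, the total becomes 24, and the
// result is {.offset = 16, .paddedLength = 8}.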

std::string toString(SourceOperandIndex sourceOperandIndex) {
    return "(" + std::to_string(sourceOperandIndex.first) + ", " +
           std::to_string(sourceOperandIndex.second) + ")";
}

// A helper class to analyze the step roles of all partition boundary operands.
//
// To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
// with the following two methods:
// - addRole: Add a step role to a boundary operand
// - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
//   operand. All of the step roles of the "dest" operand are also possible step roles of the
//   "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
//   of an interpreted IF operation may be directly used as all step roles of the corresponding
//   input operand of the then and else models. Note that this relationship is directional --
//   (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
//   shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
//   produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
//   because there must be a root operand of each step role for the memory to be allocated on
//   behalf of.
//
class StepRoleAnalyzer {
   public:
    static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
            const std::function<void(StepRoleAnalyzer&)>& setup) {
        StepRoleAnalyzer analyzer;
        setup(analyzer);
        return analyzer.finish();
    }

    void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
                 uint32_t stepIOIndex) {
        SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
        mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
    }

    void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
        mUsedBy[source].emplace(dest);
    }

   private:
    StepRoleAnalyzer() = default;

    // Merges the step roles of the destination operands to the source operands
    // and returns the final map.
    std::map<SourceOperandIndex, std::set<StepRole>> finish() {
        for (const auto& [source, _] : mUsedBy) {
            finishHelper(source);
        }
        return std::move(mRoles);
    }

    void finishHelper(SourceOperandIndex current) {
        if (mProcessedOperands.count(current) > 0) return;
        mProcessedOperands.insert(current);
        const auto it = mUsedBy.find(current);
        if (it != mUsedBy.end()) {
            auto& roles = mRoles[current];
            // Merge the step roles of the destination operands.
            for (const auto& dest : it->second) {
                finishHelper(dest);
                const auto& destRoles = mRoles[dest];
                roles.insert(destRoles.begin(), destRoles.end());
            }
        }
    }

    // A map from the source operand to its step roles.
    std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
    // A map from the source operand to a set of destination operands that may directly
    // use the memory of the source operand.
    std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
    // Used in finish to track which operand has been processed.
    std::set<SourceOperandIndex> mProcessedOperands;
};
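
// Illustrative sketch (hypothetical steps and operands): for an interpreted IF
// whose outer input operand O is used directly as input 0 of a step of the then
// model (operand I), a setup function might record
//
//     analyzer.addRole(thenStep, I.second, IOType::INPUT, 0);
//     analyzer.setUsedBy(O, I);
//
// after which analyze() returns a map in which O carries I's step role in
// addition to any roles recorded for O itself.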

}  // namespace

void DynamicTemporaries::vlogDump(const char* context) const {
    if (empty()) {
        return;
    }
    if (context) {
        VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
    }
    for (const auto& temp : mSourceOperandToTemporary) {
        VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
                        << ", stepIndex = " << temp.second.stepIndex
                        << ", offset = " << temp.second.offset
                        << ", dimensions = " << toString(temp.second.dimensions)
                        << ", paddedLength = " << temp.second.paddedLength
                        << ", alignment = " << temp.second.alignment
                        << ", padding = " << temp.second.padding;
    }
}

void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
                                 const Dimensions& initialDimensions, uint32_t initialLength,
                                 uint32_t alignment, uint32_t padding) {
    VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
                    << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
                    << ", initialDimensions = " << toString(initialDimensions)
                    << ", initialLength = " << initialLength << ", alignment = " << alignment
                    << ", padding = " << padding << ")";
    CHECK(!mDeclared);
    CHECK_GT(initialLength, 0u);
    const uint32_t paddedLength = roundUp(initialLength, padding);
    auto [_, isNew] = mSourceOperandToTemporary.emplace(
            sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
                                                         paddedLength, alignment, padding});
    CHECK(isNew);
    mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
}

bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
                                   const Dimensions& newDimensions, uint32_t newLength) {
    auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
        VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
                        << toString(sourceOperandIndex)
                        << ", newDimensions = " << toString(newDimensions)
                        << ", newLength = " << newLength << ") -> " << toString(changedShape);
        return changedShape;
    };

    CHECK(mDeclared);
    CHECK_GT(newLength, 0u);

    InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
    const uint32_t paddedLength = roundUp(newLength, temp.padding);
    if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
        return createAndLogResult(false);
    }
    if (temp.paddedLength < paddedLength) {
        // Otherwise allocation remains valid, even if it may be suboptimal
        // (because it uses more space than needed). Use case: Don't force
        // client to allocate again just because the client reported more
        // accurate shape information.
        mAllocatedStepIndexes.erase(temp.stepIndex);
    }
    temp.paddedLength = paddedLength;
    temp.dimensions = newDimensions;
    return createAndLogResult(true);
}

int DynamicTemporaries::allocate(uint32_t stepIndex) {
    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";

    CHECK(mDeclared);

    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    // perform layout
    uint32_t newSize = 0;
    for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
        // temp.paddedLength is already padded in declare and redeclare.
        CHECK(temp.paddedLength % temp.padding == 0);
        temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
    }

    // perform (re-)allocation
    // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important
    // to conserve memory, or do we waste time reallocating?
    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
                                                // deallocation/reallocation overhead
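    // Worked example (illustrative): with kWaste == 0.2 and an existing 100-byte
    // allocation, any newSize in [84, 100] reuses the allocation (e.g., newSize == 90
    // satisfies 100 <= 90 * 1.2), while newSize == 80 triggers reallocation because
    // 100 > 80 * 1.2 == 96.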
    auto& memory = mStepIndexToMemory[stepIndex];
    const uint32_t oldSize = (memory ? memory->getSize() : 0);
    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
        // Suitable allocation already exists; nothing to do
    } else {
        int n;
        std::tie(n, memory) = MemoryAshmem::create(newSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
                       << " for step " << stepIndex;
            mAllocatedStepIndexes.erase(stepIndex);
            return n;
        }
    }

    mAllocatedStepIndexes.insert(stepIndex);
    return ANEURALNETWORKS_NO_ERROR;
}

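// A step counts as "allocated" if it declares no dynamic temporaries at all, or
// if allocate() has succeeded for it since its temporaries were last redeclared
// with a larger padded length.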
bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
    return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
            mStepIndexToSourceOperandIndexes.end()) ||
           mAllocatedStepIndexes.count(stepIndex);
}

std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
        SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
    CHECK(mDeclared);
    if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
        it != mSourceOperandToTemporary.end()) {
        const InternalLocationAndShape& temp = it->second;
        const bool isAllocated = allocated(temp.stepIndex);
        if (mustBeAllocated) {
            CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
                               << " must be allocated";
        }
        if (isAllocated) {
            return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
                                    &temp.dimensions, temp.paddedLength};
        } else {
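            // Not allocated yet: return a null memory and an invalid offset as
            // sentinels, but still expose the current shape and padded length.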
            return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
        }
    }
    return std::nullopt;
}

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}

int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
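    // Each step model input is bound by the first match below: a static temporary,
    // a dynamic temporary, a main model input, a main model output consumed
    // downstream, or a constant partition boundary operand. Step model outputs
    // follow the same order, minus the main-model-input and constant cases.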
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
                                          loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
                                          loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}

void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
    auto declareModelOutputIsDownstreamInput =
            [this](const SourceOperandIndex& sourceOperandIndex) {
                const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
                CHECK(it != mOutputToDefiningExecutionStep.end());
                uint32_t stepIndex = it->second;
                CHECK_LT(stepIndex, mSteps.size());
                VLOG(COMPILATION)
                        << "ExecutionStep(" << stepIndex
                        << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
                        << toString(sourceOperandIndex) << ")";
                CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
                      mSourceOperandToOutputIndex.end());
                mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
                        mSourceOperandToOutputIndex.at(sourceOperandIndex));
            };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& output : step->getOutputsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
                declareModelOutputIsDownstreamInput(sourceOperandIndex);
            }
        }
    }
}

void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
            return;
        }
        uint32_t stepIndex = it->second;
        CHECK_LT(stepIndex, mSteps.size());
        mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
    };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& input : step->getTempsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            recordAsOutputIfTemporary(step->conditionOperandIndex);
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
}

void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
    VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
                      << mainModelOutputIndex << ")";
    const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
                              mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
    CHECK(it != mOutputIndexStepModelToMainModel.end());
    const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
    CHECK(stepModelOutputIndex < mModelOutputs.size());
    mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
}

void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
    const auto it = mOperandMap.find(stepOperandIndex);
    CHECK(it != mOperandMap.end());
    mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
}

const ModelBuilder* ExecutionStep::getSourceModel() const {
    return mPlan->getSourceModels().getModel(mSourceModelIndex);
}

void ExecutionStep::logStepModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += toString(e.first);
        toLog += "->";
        toLog += toString(e.second);
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("step model inputs", mStepModelInputs);
    logRemapVector("step model outputs", mStepModelOutputs);
    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
    logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
    logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
}

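// An operand has unknown size if it is a tensor with no dimensions at all or
// with at least one zero dimension (e.g., {2, 0, 3}); a scalar operand never
// has unknown size.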
static bool hasUnknownSize(const Operand& operand) {
    if (operand.dimensions.empty()) {
        return TypeManager::get()->isTensorType(operand.type);
    }
    for (const Dimension& dimension : operand.dimensions) {
        if (dimension == 0) {
            return true;
        }
    }
    return false;
}

int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
                              << " of source graph) has unknown size: " << operand;
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // A step model with no inputs or no outputs is an invalid model. Note that we would like to
    // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from
    // model validation.
    if (hasNoInputsOrNoOutputs()) {
        VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs "
                             "or no outputs";
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and
        // relies on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(),
                   &mToken, &mPreparedStepModel);
}

void ExecutionStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
        logModelToInfo(mStepModel.makeModel());
    }
}

std::ostream& operator<<(std::ostream& os, const IfStep& step) {
    return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
              << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
}

std::ostream& operator<<(std::ostream& os, const WhileStep& step) {
    return os << "Step#" << step.index << ": while cond=" << step.condStepIndex
              << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
}

std::ostream& operator<<(std::ostream& os, const GotoStep& step) {
    return os << "Step#" << step.index << ": goto " << step.gotoStepIndex;
}

void LogicalStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        if (const IfStep* step = tryIfStep()) {
            VLOG(COMPILATION) << *step;
        } else if (const WhileStep* step = tryWhileStep()) {
            VLOG(COMPILATION) << *step;
        } else if (const GotoStep* step = tryGotoStep()) {
            VLOG(COMPILATION) << *step;
        } else {
            executionStep()->dump();
        }
    }
}

int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                        int32_t executionPreference, int32_t priority,
                                        const OptionalTimePoint& deadline,
                                        int simulateFailureResultCode) {
    CHECK(!mSuccessfulFinish);
    CHECK(!deadline.has_value());
    const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);

    auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
        for (const auto& sourceOperandIndex : operands) {
            const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
            const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
            if (hasUnknownSize(operand)) {
                return true;
            }
        }
        return false;
    };

    findTempsAsStepModelOutputs();
    for (const auto& logicalStep : mSteps) {
        if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
            bool stepHasDynamicTemporaries = false;
            int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries,
                                          executionPreference, priority);
            if (stepHasDynamicTemporaries) {
                mHasDynamicTemporaries = true;
                if (step->getDevice()->getFeatureLevel() < kHalVersionV1_2ToApi.featureLevel) {
                    // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT
                    // must have fully specified dimensions either in the
                    // Operand or in the RequestArgument. In the case of a
                    // dynamic temporary, we won't be able to supply fully
                    // specified dimensions in either.
                    VLOG(COMPILATION)
                            << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex()
                            << " defines dynamic temporaries but is scheduled on pre-1.2 device "
                            << step->getDevice()->getName();
                    if (n == ANEURALNETWORKS_NO_ERROR) {
                        n = ANEURALNETWORKS_OP_FAILED;
                    }
                }
            }
            if (n != ANEURALNETWORKS_NO_ERROR) {
                VLOG(COMPILATION)
                        << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
                return n;
            }
        } else if (IfStep* step = logicalStep->tryIfStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            // step->conditionOperandIndex has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->thenBranchInputOperands));
            CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
            CHECK(!containsUnknownSize(step->elseBranchInputOperands));
            CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
        } else if (WhileStep* step = logicalStep->tryWhileStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            CHECK(!containsUnknownSize(step->condInputOperands));
            // step->condOutputOperand has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->bodyInputOperands));
            CHECK(!containsUnknownSize(step->bodyOutputOperands));
        } else {
            CHECK(logicalStep->isGoto());
        }
    }

    if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
        VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish: simulating failure, ResultCode "
                          << simulateFailureResultCode;
        return simulateFailureResultCode;
    }

    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
        mSourceOperandToInputIndex[index] = i;
    }
    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
        mSourceOperandToOutputIndex[index] = i;
    }

    findControlFlowBoundaryConstants(sourceModels);
    findModelOutputsThatAreDownstreamInputs();
    findMemoryStepRoles();

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::POINTER) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) {
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}

void ExecutionPlan::CompoundBody::findMemoryStepRoles() {
    mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) {
        for (const auto& logicalStep : mSteps) {
            if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
                const auto& stepModelInputs = step->getStepModelInputs();
                for (uint32_t i = 0; i < stepModelInputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelInputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::INPUT, i);
                }
                const auto& stepModelOutputs = step->getStepModelOutputs();
                for (uint32_t i = 0; i < stepModelOutputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelOutputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i);
                }
            } else if (const IfStep* step = logicalStep->tryIfStep()) {
                // See ExecutionPlan::nextCompound(const IfStep*, ...).
                //
                // For an interpreted IF operation, the outer input memories may be directly used
                // by the SUBGRAPH_INPUTs of the then and else models.
                CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size());
                CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->thenBranchInputOperands[i]);
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->elseBranchInputOperands[i]);
                }
                // For an interpreted IF operation, the outer output memories may be directly used
                // by the SUBGRAPH_OUTPUTs of the then and else models.
                CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size());
                CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size());
                for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->thenBranchOutputOperands[i]);
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->elseBranchOutputOperands[i]);
                }
            } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
                // See ExecutionPlan::nextCompound(const WhileStep*, ...).
                //
                // For an interpreted WHILE operation, the following memories are involved:
                // a. the outer input memories to the WHILE operation
                // b. the outer output memories to the WHILE operation
                // c. the output memory of the condition model
                // d. one set of output memories of the body model
                // e. another set of output memories of the body model
                //
                // The memories are used in the following ways:
                //
                // - Condition model:
                //   * In the first iteration: inputs use (a); output uses (c)
                //   * In the following iterations: inputs use (d) or (e) for input-output and
                //     state-only operands, and (a) for input-only operands; output uses (c)
                //
                // - Body model:
                //   * In all iterations: inputs are the same as the condition model; outputs use
                //     (d) or (e)
                //
                // Therefore, we configure the analyzer with the following used-by relationships:
                // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of
                //   the condition model for all inputs in the first iteration, as well as the
                //   input-only operands in the following iterations.
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only
                //   operands after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}

int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
                                      int32_t priority, const OptionalTimePoint& deadline,
                                      int simulateFailureResultCode) {
    CHECK(!mSuccessfulFinish);
    CHECK(mDevice != nullptr);
    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
                    &mToken, &mPreparedModel);
    if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
        VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
                          << simulateFailureResultCode;
        n = simulateFailureResultCode;
    }
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const OptionalTimePoint& deadline, int simulateFailureResultCode) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
                         simulateFailureResultCode);
}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // mTemporaries is null here, so bail out before the constant-copy loop
        // below dereferences it.
        return;
    }
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() +
                       mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset,
               location.buffer, location.length);
    }
}

// Attempt to create a burst object for each PreparedModel/Partition. If the
// burst controller object cannot be made, return a nullptr in its place to
// indicate that the regular execution path should be used. This can occur either
// because the PreparedModel was nullptr (the CPU was the best choice), or because
// the IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts() const1160 std::vector<SharedBurst> ExecutionPlan::makeBursts() const {
1161 switch (mState) {
1162 // burst object for each partition in the compound case
1163 case COMPOUND: {
1164 std::vector<SharedBurst> bursts;
1165 bursts.reserve(compound()->mSteps.size());
1166 for (const auto& logicalStep : compound()->mSteps) {
1167 if (!logicalStep->isExecution()) {
1168 bursts.push_back(nullptr);
1169 continue;
1170 }
1171 if (const auto preparedModel =
1172 logicalStep->executionStep()->getPreparedStepModel()) {
1173 const auto maybeBurst = preparedModel->configureExecutionBurst();
1174 if (!maybeBurst.has_value()) {
1175 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1176 << maybeBurst.error().code << ": " << maybeBurst.error().message;
1177 }
1178 bursts.push_back(maybeBurst.value_or(nullptr));
1179 } else {
1180 bursts.push_back(nullptr);
1181 }
1182 }
1183 return bursts;
1184 }
1185 // single burst object for the simple case
1186 case SIMPLE: {
1187 std::vector<SharedBurst> burst;
1188 auto simpleBody = simple();
1189 if (const auto preparedModel = simpleBody->mPreparedModel) {
1190 const auto maybeBurst = preparedModel->configureExecutionBurst();
1191 if (!maybeBurst.has_value()) {
1192 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1193 << maybeBurst.error().code << ": " << maybeBurst.error().message;
1194 }
1195 burst.push_back(maybeBurst.value_or(nullptr));
1196 } else {
1197 burst.push_back(nullptr);
1198 }
1199 return burst;
1200 }
1201 // no burst objects made
1202 default:
1203 return {};
1204 }
1205 }
1206
makeController(ExecutionBuilder * executionBuilder,const BurstBuilder * burstBuilder) const1207 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
1208 ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
1209 CHECK(isValid());
1210 CHECK(mState != SIMPLE);
1211 const auto* body = compound();
1212 // Create the layout for a RuntimeMemory object big enough to hold
1213 // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
1214 // - buffers required by the control flow implementation.
1215 //
1216 // TODO: Rethink this approach for managing temporaries. Some
1217 // alternatives:
1218 //
1219 // 1) Adopt a memory layout scheme analogous to stack allocation,
1220 // where objects of non-overlapping lifetime can occupy the same
1221 // storage. We would still have a single Memory object in this
1222 // case.
1223 //
1224 // 2) Do something like what CpuExecutor does, and do allocations
1225 // and deallocations on the fly (during execution) before first
1226 // reference and after last reference, respectively. This would
1227 // mean having one Memory object per TEMPORARY; or, in a more
1228 // complicated implementation, one Memory object per set of
1229 // temporaries that have the same lifetime. Note that the Android
1230 // system limits the number of shared memory objects, which are
1231 // what our Memory objects represent.
1232 //
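     // A minimal sketch of the allocation scheme used below (illustrative only,
     // not part of this file; addTemporary() is presumably defined earlier in
     // this translation unit, and roundUp() is assumed to round its first
     // argument up to a multiple of its second):
     //
     //   StaticTemporaryLocation addTemporary(uint32_t* total, uint32_t length,
     //                                        uint32_t alignment, uint32_t padding) {
     //       uint32_t offset = roundUp(*total, alignment);      // align the start
     //       uint32_t paddedLength = roundUp(length, padding);  // pad the length
     //       *total = offset + paddedLength;                    // grow the arena
     //       return {.offset = offset, .paddedLength = paddedLength};
     //   }
     //
     // Each boundary temporary thus occupies a distinct, aligned slice of a
     // single MemoryAshmem arena whose final size is totalSizeOfTemporaries.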
1233 uint32_t totalSizeOfTemporaries = 0;
1234 // This function has two modes of operation:
1235 // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
1236 // TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
1237 // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
1238 // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
1239 // operand of another lifetime.
1240 // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
1241 // SUBGRAPH_OUTPUT source operands and panic if we see a source operand
1242 // of another lifetime.
1243 auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries](
1244 const SourceOperandIndex& sourceOperandIndex,
1245 std::map<SourceOperandIndex, StaticTemporaryLocation>*
1246 sourceOperandToLocationOfTemporary,
1247 Operand::LifeTime lifetime =
1248 Operand::LifeTime::TEMPORARY_VARIABLE) {
1249 CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
1250 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT);
1251 const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex);
1252 if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE &&
1253 sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
1254 // See the caller for explanation.
1255 return;
1256 }
1257 CHECK_EQ(sourceOperand.lifetime, lifetime);
1258 const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
1259 if (size != 0u) {
1260 const auto memoryPreference =
1261 body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1262 const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
1263 memoryPreference.padding);
1264 auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
1265 CHECK(isNew);
1266 VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
1267 << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
1268 } else {
1269 // Unknown size, hence dynamic temporary. The mapping will
1270 // be established elsewhere (DynamicTemporaries::allocate()).
1271 CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
1272 CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
1273 }
1274 };
1275 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
1276 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
1277 for (const auto& logicalStep : body->mSteps) {
1278 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1279 // Allocate memory for ExecutionStep temporary outputs that are
1280 // inputs to other steps, as determined by
1281 // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
1282 //
1283 // We don't allocate memory for step model output operands with
1284 // source operand lifetime SUBGRAPH_OUTPUT because they will be
1285 // - managed by the client (main model outputs),
1286 // - assigned a location of another operand (when this step model
1287 // output is a branch model output of an IF; see
1288 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1289 // - allocated by a WHILE (when this step model output
1290 // is a condition or body model output of a WHILE; see the
1291 // step->bodyOutputOperands and step->condOutputOperand handling
1292 // below).
1293 for (const auto& output : step->getTempsAsStepModelOutputs()) {
1294 mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
1295 &sourceOperandToLocationOfTemporary);
1296 }
1297 } else if (const IfStep* step = logicalStep->tryIfStep()) {
1298 // Allocate memory for all temporary outputs of an IfStep because
1299 // they are going to be written to by a branch model. We don't
1300             // perform unused output operand optimization for referenced models.
1301 //
1302 // We don't allocate memory for branch output operands because they
1303 // use the same location as the corresponding outer output operands,
1304 // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
1305 //
1306 // We don't allocate memory for outer output operands with source
1307 // operand lifetime SUBGRAPH_OUTPUT because they will be
1308 // - managed by the client (main model outputs),
1309 // - assigned a location of another operand (when this IF outer
1310 // output is a branch model output of another IF; see
1311 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1312 // - allocated by a WHILE (when this IF outer output
1313 // is a condition or body model output of a WHILE; see the
1314 // step->bodyOutputOperands and step->condOutputOperand handling
1315 // below).
1316 for (const auto& sourceOperandIndex : step->outerOutputOperands) {
1317 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
1318 }
1319 } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1320             // Allocate memory for all temporary outputs of a WhileStep because
1321 // they are going to be written to by the WHILE loop.
1322 //
1323 // We don't allocate memory for outer output operands with source
1324 // operand lifetime SUBGRAPH_OUTPUT because they will be
1325 // - managed by the client (main model outputs),
1326 // - assigned a location of another operand (when this WHILE outer
1327 // output is a branch model output of an IF; see
1328 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1329 // - allocated by another WHILE (when this WHILE outer output
1330 // is a condition or body model output of another WHILE; see the
1331 // step->bodyOutputOperands and step->condOutputOperand handling
1332 // below).
1333 for (const auto& sourceOperandIndex : step->outerOutputOperands) {
1334 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
1335 }
1336 // Allocate memory for body model outputs. Note that we could use
1337 // the outer output operand memory instead but we currently don't do
1338 // so (b/148206073).
1339 for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
1340 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
1341 Operand::LifeTime::SUBGRAPH_OUTPUT);
1342 // Allocate another set of temporaries for double buffering.
1343 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
1344 Operand::LifeTime::SUBGRAPH_OUTPUT);
1345 }
1346 // Allocate memory for condition model output.
1347 // TODO: Share one condition output memory region between all loops.
1348 mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
1349 Operand::LifeTime::SUBGRAPH_OUTPUT);
1350 } else {
1351 CHECK(logicalStep->isGoto());
1352 }
1353 }
1354 // Allocate temporary memory for boundary CONSTANT_COPY operands.
1355 for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
1356 const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1357 const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
1358 memoryPreference.alignment, memoryPreference.padding);
1359 sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
1360 VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
1361 << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
1362 }
1363 // Collect dynamic temporaries.
1364 // TODO(b/157236079): Move some or all of this work to compilation time?
1365 DynamicTemporaries dynamicTemporaries;
1366 const TypeManager* typeManager = TypeManager::get();
1367 forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
1368 SourceOperandIndex sourceOperandIndex,
1369 const Operand& sourceOperand, uint32_t definingStepIndex) {
1370 CHECK(typeManager->isTensorType(sourceOperand.type));
1371 const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1372 // TODO: For now we guess an initial size equal to element
1373 // size, which is overly conservative.
1374 const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
1375 dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
1376 size, memoryPreference.alignment, memoryPreference.padding);
1377 });
1378 dynamicTemporaries.endDeclarations();
1379 dynamicTemporaries.vlogDump("finished declarations");
1380
1381 return std::shared_ptr<Controller>(new Controller(
1382 this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
1383 std::move(sourceOperandToLocationOfTemporary),
1384 std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
1385 body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
1386 body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
1387 }
1388
1389 // TODO: Find a better way to provide this functionality.
1390 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
1391 std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1392 const std::vector<OutputShape>* mainModelOutputShapes) const {
1393 *executor = nullptr;
1394 if (burstController != nullptr) {
1395 *burstController = nullptr;
1396 }
1397
1398 VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
1399 << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
1400
1401 if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
1402 // We haven't called next().
1403 return ANEURALNETWORKS_OP_FAILED;
1404 }
1405
1406 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1407 // The last call to next() did not produce an executor.
1408 return ANEURALNETWORKS_OP_FAILED;
1409 }
1410
1411 controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1412 return next(controller, executor, burstController, mainModelOutputShapes);
1413 }
1414
1415 ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
1416 : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
1417 mOffset(0) {}
1418
1419 ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
1420 : mInfo(std::move(info)), mOffset(offset) {}
1421
1422 void* ExecutionPlan::Buffer::getPointer() const {
1423 return mInfo.getBuffer() + mOffset;
1424 }
1425
1426 uint32_t ExecutionPlan::Buffer::getSize() const {
1427 return mInfo.getSize() - mOffset;
1428 }
1429
1430 void ExecutionPlan::Buffer::flush() const {
1431 mInfo.flush();
1432 }
1433
1434 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1435 const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1436 switch (info.state()) {
1437 case ModelArgumentInfo::POINTER: {
1438 return Buffer(info.buffer(), info.length());
1439 } break;
1440 case ModelArgumentInfo::MEMORY: {
1441 if (std::optional<RunTimePoolInfo> poolInfo =
1442 executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1443 return Buffer(*poolInfo, info.locationAndLength().offset);
1444 } else {
1445 LOG(ERROR) << "Unable to map operand memory pool";
1446 return std::nullopt;
1447 }
1448 } break;
1449 case ModelArgumentInfo::HAS_NO_VALUE: {
1450 LOG(ERROR) << "Attempting to read an operand that has no value";
1451 return std::nullopt;
1452 } break;
1453 default: {
1454 LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1455 return std::nullopt;
1456 } break;
1457 }
1458 }
1459
1460 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
1461 std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
1462 const auto& sourceOperandToLocationOfTemporary =
1463 controller->mSourceOperandToLocationOfTemporary;
1464 const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
1465 const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
1466 const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
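         // Resolve the operand in precedence order: a partition-boundary
         // temporary, a main model input, a main model output, then a boundary
         // constant reference. An operand found in none of these maps has no
         // buffer.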
1467 if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
1468 it != sourceOperandToLocationOfTemporary.end()) {
1469 const uint32_t offset = it->second.offset;
1470 const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
1471 return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
1472 } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
1473 it != sourceOperandToInputIndex.end()) {
1474 const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
1475 return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1476 } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
1477 it != sourceOperandToOutputIndex.end()) {
1478 const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
1479 return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1480 } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
1481 it != sourceOperandToConstantReference.end()) {
1482 const ConstantReferenceLocation& location = it->second;
1483 const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
1484 if (info == std::nullopt) {
1485 return std::nullopt;
1486 }
1487 return Buffer(info->getBuffer() + location.offset, location.length);
1488 }
1489 return std::nullopt;
1490 }
1491
1492 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1493 SourceOperandIndex operandIndex, bool* value) const {
1494 std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1495 if (buffer == std::nullopt) {
1496 LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1497 return ANEURALNETWORKS_OP_FAILED;
1498 }
1499 CHECK_GE(buffer->getSize(), sizeof(bool8));
1500 bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1501 *value = static_cast<bool>(value8);
1502 VLOG(EXECUTION) << "readConditionValue: " << *value;
1503 return ANEURALNETWORKS_NO_ERROR;
1504 }
1505
1506 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
1507 std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1508 const std::vector<OutputShape>* mainModelOutputShapes,
1509 int syncFdOfLastStep) const {
1510 CHECK(mState == COMPOUND);
1511
1512 controller->mLastStepSyncFd = syncFdOfLastStep;
1513 *executor = nullptr;
1514 if (burstController != nullptr) {
1515 *burstController = nullptr;
1516 }
1517
1518 VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
1519 << "): mNextStepIndex = " << controller->mNextStepIndex;
1520
1521 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1522 return ANEURALNETWORKS_OP_FAILED;
1523 }
1524
1525 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1526 }
1527
1528 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1529 std::shared_ptr<StepExecutor>* executor,
1530 SharedBurst* burstController,
1531 const std::vector<OutputShape>* mainModelOutputShapes) const {
1532 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1533 return ANEURALNETWORKS_OP_FAILED;
1534 }
1535
1536 auto compoundBody = compound();
1537 if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1538 controller->mNextStepIndex = Controller::kBadStepIndex; // end
1539 return ANEURALNETWORKS_NO_ERROR;
1540 }
1541
1542 const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1543 if (const IfStep* step = logicalStep->tryIfStep()) {
1544 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1545 } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1546 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1547 } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1548 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1549 } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1550 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1551 } else {
1552 CHECK(false) << "Unknown step variant";
1553 return ANEURALNETWORKS_BAD_STATE;
1554 }
1555 }
1556
1557 int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
1558 std::shared_ptr<StepExecutor>* executor,
1559 SharedBurst* burstController,
1560 const std::vector<OutputShape>* mainModelOutputShapes) const {
1561 VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
1562 << step->getDevice()->getName();
1563
1564 NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
1565 controller->mDynamicTemporaries.vlogDump("finished allocating for a step");
1566
1567 *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
1568 step->getDevice(), step->getPreparedStepModel(),
1569 /*reusable=*/false, step,
1570 &controller->mDynamicTemporaries);
1571
1572 step->mapInputsAndOutputs(
1573 *executor, mainModelOutputShapes, controller->mTemporaries.get(),
1574 controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
1575 controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
1576 controller->mSourceOperandToConstantReference);
1577 if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
1578 *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
1579 }
1580
1581 controller->mFallbackNextStepIndex = controller->mNextStepIndex;
1582 controller->mNextStepIndex++;
1583 return ANEURALNETWORKS_NO_ERROR;
1584 }
1585
1586 // The first argument is the "source" operand; the second is the "destination".
1587 void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
1588 const SourceOperandIndex& innerOperand) {
1589 VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
1590 << toString(outerOperand);
1591 #ifdef NN_DEBUGGABLE
1592 CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
1593 mSourceOperandToInputIndex.count(innerOperand) +
1594 mSourceOperandToOutputIndex.count(innerOperand) +
1595 mSourceOperandToConstantReference.count(innerOperand),
1596 1u);
1597 #endif
1598 mSourceOperandToLocationOfTemporary.erase(innerOperand);
1599 mSourceOperandToInputIndex.erase(innerOperand);
1600 mSourceOperandToOutputIndex.erase(innerOperand);
1601 mSourceOperandToConstantReference.erase(innerOperand);
1602 if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
1603 it != mSourceOperandToLocationOfTemporary.end()) {
1604 mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
1605 } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
1606 it != mSourceOperandToInputIndex.end()) {
1607 mSourceOperandToInputIndex.emplace(innerOperand, it->second);
1608 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1609 it != mSourceOperandToOutputIndex.end()) {
1610 mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1611 } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
1612 it != mSourceOperandToConstantReference.end()) {
1613 mSourceOperandToConstantReference.emplace(innerOperand, it->second);
1614 } else {
1615 CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
1616 << " from operand " << toString(outerOperand);
1617 }
1618 }
1619
1620 // The first argument is the "source" operand; the second is the "destination".
1621 void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
1622 const SourceOperandIndex& innerOperand) {
1623 VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
1624 << toString(outerOperand);
1625 #ifdef NN_DEBUGGABLE
1626 CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
1627 mSourceOperandToOutputIndex.count(innerOperand),
1628 1u);
1629 #endif
1630 mSourceOperandToLocationOfTemporary.erase(innerOperand);
1631 mSourceOperandToOutputIndex.erase(innerOperand);
1632 if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
1633 it != mSourceOperandToLocationOfTemporary.end()) {
1634 mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
1635 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1636 it != mSourceOperandToOutputIndex.end()) {
1637 mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1638 } else {
1639 CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
1640 << " from operand " << toString(outerOperand);
1641 }
1642 }
1643
1644 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1645 if (mLastStepSyncFd == -1) {
1646 return ANEURALNETWORKS_NO_ERROR;
1647 }
1648 VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1649 auto r = syncWait(mLastStepSyncFd, -1);
1650 int n = ANEURALNETWORKS_NO_ERROR;
1651 if (r != FenceState::SIGNALED) {
1652 LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1653 n = ANEURALNETWORKS_OP_FAILED;
1654 }
1655 return n;
1656 }
1657
1658 // Invocations of Controller::setInput/setOutput in this function must match with invocations of
1659 // StepRoleAnalyzer::setUsedBy in the IfStep branch in
1660 // ExecutionPlan::CompoundBody::findMemoryStepRoles.
1661 int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
1662 std::shared_ptr<StepExecutor>* executor,
1663 SharedBurst* burstController,
1664 const std::vector<OutputShape>* mainModelOutputShapes) const {
1665 VLOG(EXECUTION) << "next: " << *step;
1666 // If the last step has a sync fence, wait for it to signal before reading the condition value.
1667 // This is safe because the steps are serialized when doing fenced compute.
1668 NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
1669 bool condValue;
1670 NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
1671 controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
1672 const std::vector<SourceOperandIndex>& branchInputOperands =
1673 condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
1674 const std::vector<SourceOperandIndex>& branchOutputOperands =
1675 condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
1676 CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
1677 CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
1678 for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
1679 // We have to do this assignment just before executing this step to
1680 // accommodate cases when the IF resides within a WHILE condition or
1681 // body model and for some j the i-th input of the IF branch model is
1682 // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
1683 // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
1684 // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
1685 // In such cases, the WhileStep modifies the location of
1686 // step->outerInputOperands[i] to implement double buffering.
1687 controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
1688 }
1689 for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
1690 // We have to do this assignment just before executing this step to
1691 // accommodate the case when the IF resides within a WHILE body
1692 // model and the i-th output of the IF branch model is an
1693 // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
1694 // some j). In that case, the WhileStep modifies the location of
1695 // step->outerOutputOperands[i] to implement double buffering.
1696 controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
1697 }
1698 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1699 }
1700
1701 // Invocations of Controller::setInput in this function must match with invocations of
1702 // StepRoleAnalyzer::setUsedBy in the WhileStep branch in
1703 // ExecutionPlan::CompoundBody::findMemoryStepRoles.
1704 int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
1705 std::shared_ptr<StepExecutor>* executor,
1706 SharedBurst* burstController,
1707 const std::vector<OutputShape>* mainModelOutputShapes) const {
1708 WhileState& state = controller->mWhileState[controller->mNextStepIndex];
1709 if (state.stage == WhileState::EVALUATE_CONDITION) {
1710 state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
1711 VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
1712 << ": evaluating condition";
1713 controller->mNextStepIndex = step->condStepIndex;
1714
1715 if (state.iteration == 0) {
1716 state.startTime = Clock::now();
1717 }
1718
1719 // iteration = 0 cond inputs = outer inputs
1720 // iteration = 1 cond inputs = body outputs
1721 // iteration = 2 cond inputs = body outputs
1722 // iteration = 3 cond inputs = ...
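             // That is, on iteration 0 every condition input aliases the
             // corresponding outer input; on later iterations, the first
             // bodyOutputOperands.size() condition inputs alias the latest body
             // outputs, while trailing input-only operands keep aliasing the
             // outer inputs.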
1723 uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
1724 CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
1725 CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
1726 for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
1727 bool operandIsInputOnly = i >= loopBodyOutputCount;
1728 controller->setInput((state.iteration == 0 || operandIsInputOnly)
1729 ? step->outerInputOperands[i]
1730 : step->bodyOutputOperands[i],
1731 step->condInputOperands[i]);
1732 }
1733
1734 state.stage = WhileState::EVALUATE_BODY;
1735 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1736 }
1737
1738 CHECK(state.stage == WhileState::EVALUATE_BODY);
1739 std::chrono::nanoseconds timeoutDuration(
1740 controller->mExecutionBuilder->getLoopTimeoutDuration());
1741 auto duration = Clock::now() - state.startTime;
1742 if (duration > timeoutDuration) {
1743 LOG(ERROR) << "WHILE loop timed out after "
1744 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1745 << " ms";
1746 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1747 }
1748
1749 // If the last step has a sync fence, wait for it to signal before reading the condition value.
1750 // This is safe because the steps are serialized when doing fenced compute.
1751 NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
1752 bool condValue;
1753 NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
1754 if (condValue) {
1755 VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
1756 << ": evaluating body";
1757 controller->mNextStepIndex = step->bodyStepIndex;
1758
1759 // iteration = 0 body inputs = cond inputs = outer inputs body outputs = tmp1
1760 // iteration = 1 body inputs = cond inputs = tmp1 body outputs = tmp2
1761 // iteration = 2 body inputs = cond inputs = tmp2 body outputs = tmp1
1762 // iteration = 3 body inputs = cond inputs = ... body outputs = ...
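             // Worked example of the double buffering: iteration 1 reads its
             // body inputs from tmp1 and writes its body outputs to tmp2; the
             // std::swap of the two location maps below then makes iteration 2
             // read from tmp2 and write to tmp1, and so on. Only offset
             // bookkeeping changes; no data is copied between the two buffers.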
1763 #ifdef NN_DEBUGGABLE
1764 CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
1765 CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
1766 CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
1767 CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
1768 #endif
1769 for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
1770 controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
1771 }
1772 if (state.iteration != 0) {
1773 for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
1774 #ifdef NN_DEBUGGABLE
1775 CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
1776 CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
1777 CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
1778 CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
1779 #endif
1780 std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
1781 controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
1782 }
1783 }
1784 } else {
1785 VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
1786 << ": exiting loop";
1787 controller->mNextStepIndex = step->exitStepIndex;
1788
1789 // Copy body outputs to outer outputs.
1790 // TODO: Use outer outputs instead of tmp2 to avoid copying?
1791 CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
1792 for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
1793 // condInputOperands[i] points to a body output operand from the
1794 // last iteration if we've executed at least one iteration and to a
1795 // WHILE operation input operand otherwise.
1796 const SourceOperandIndex& innerOperand = step->condInputOperands[i];
1797 const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
1798 std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
1799 if (outerBuffer == std::nullopt) {
1800 // This should never happen.
1801 LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
1802 return ANEURALNETWORKS_OP_FAILED;
1803 }
1804 const Operand& sourceOperand =
1805 controller->mExecutionBuilder->getSourceOperand(outerOperand);
1806 const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
1807 CHECK_NE(size, 0u);
1808 std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
1809 if (innerBuffer == std::nullopt) {
1810 // This should never happen.
1811 LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
1812 return ANEURALNETWORKS_OP_FAILED;
1813 }
1814 CHECK_LE(size, innerBuffer->getSize());
1815 CHECK_LE(size, outerBuffer->getSize());
1816 memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
1817 outerBuffer->flush();
1818 }
1819 state.iteration = WhileState::kOutsideLoop;
1820 }
1821
1822 state.stage = WhileState::EVALUATE_CONDITION;
1823 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1824 }
1825
1826 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
1827 std::shared_ptr<StepExecutor>* executor,
1828 SharedBurst* burstController,
1829 const std::vector<OutputShape>* mainModelOutputShapes) const {
1830 VLOG(EXECUTION) << "next: " << *step;
1831 controller->mNextStepIndex = step->gotoStepIndex;
1832 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1833 }
1834
1835 std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
1836 bool reusable, ExecutionBuilder* executionBuilder) const {
1837 auto simpleBody = simple();
1838 auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
1839 simpleBody->mDevice, simpleBody->mPreparedModel,
1840 reusable);
1841 executor->mapInputsAndOutputsTrivially();
1842 return executor;
1843 }
1844
1845 void ExecutionPlan::becomeCompoundIfEmpty() {
1846 CHECK(mState != SIMPLE);
1847 if (mState == EMPTY) {
1848 mBody = new CompoundBody(this);
1849 mState = COMPOUND;
1850 }
1851 }
1852
1853 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1854 const std::shared_ptr<Device> device) {
1855 becomeCompoundIfEmpty();
1856 auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1857 compound()->mSteps.size(), sourceModelIndex, device);
1858 compound()->mSteps.push_back(step);
1859 return step->executionStep();
1860 }
1861
1862 IfStep* ExecutionPlan::createNewIfStep() {
1863 becomeCompoundIfEmpty();
1864 auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1865 step->ifStep()->index = compound()->mSteps.size();
1866 compound()->mSteps.push_back(step);
1867 return step->ifStep();
1868 }
1869
1870 WhileStep* ExecutionPlan::createNewWhileStep() {
1871 becomeCompoundIfEmpty();
1872 auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1873 step->whileStep()->index = compound()->mSteps.size();
1874 compound()->mSteps.push_back(step);
1875 return step->whileStep();
1876 }
1877
1878 GotoStep* ExecutionPlan::createNewGotoStep() {
1879 becomeCompoundIfEmpty();
1880 auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1881 step->gotoStep()->index = compound()->mSteps.size();
1882 compound()->mSteps.push_back(step);
1883 return step->gotoStep();
1884 }
1885
1886 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
1887 const ModelBuilder* model) {
1888 CHECK(mState == EMPTY);
1889 mBody = new SimpleBody(device, model, mCacheInfo, mToken);
1890 mState = SIMPLE;
1891 }
1892
1893 void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1894 auto [it, isNew] =
1895 compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1896 CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
1897 << toString(sourceOperandIndex) << " already defined by step " << it->second;
1898 }
1899
1900 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1901 auto [it, isNew] =
1902 compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1903 CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1904 << toString(sourceOperandIndex) << " already defined by step " << it->second;
1905 }
1906
1907 void ExecutionPlan::dump() const {
1908 if (mBody) {
1909 mBody->dump();
1910 } else {
1911 VLOG(COMPILATION) << "EMPTY";
1912 }
1913 }
1914
1915 void ExecutionPlan::reset() {
1916 if (mBody) {
1917 delete mBody;
1918 mBody = nullptr;
1919 }
1920 mState = EMPTY;
1921 }
1922
1923 bool ExecutionPlan::isSimpleCpu() const {
1924 return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1925 }
1926
1927 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1928 switch (mState) {
1929 case EMPTY:
1930 return Kind::EMPTY;
1931 case SIMPLE:
1932 nnAssert(mBody);
1933 return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1934 case COMPOUND:
1935 nnAssert(mBody);
1936 return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1937 default:
1938 nnAssert(!"unexpected state");
1939 return Kind::ERROR;
1940 }
1941 }
1942
1943 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1944 return simple()->mDevice;
1945 }
1946
1947 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
1948 return compound()->mSteps;
1949 }
1950
1951 std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
1952 CHECK_EQ(getSourceModels().size(), size_t(1));
1953 std::set<uint32_t> ret;
1954 forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
1955 ret.insert(dynTemp.second);
1956 });
1957 return ret;
1958 }
1959
1960 bool ExecutionPlan::hasDynamicTemporaries() const {
1961 return mBody->hasDynamicTemporaries();
1962 }
1963
1964 bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
1965 return mBody->hasStepModelWithNoInputsOrNoOutputs();
1966 }
1967
1968 bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
1969 return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
1970 const ExecutionStep* step = logicalStep->tryExecutionStep();
1971 return step != nullptr && step->hasNoInputsOrNoOutputs();
1972 });
1973 }
1974
1975 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
1976 return simple()->mToken.getCacheToken();
1977 }
1978
1979 void ExecutionPlan::SimpleBody::dump() const {
1980 VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
1981 }
1982
1983 void ExecutionPlan::CompoundBody::dump() const {
1984 for (const auto& step : mSteps) {
1985 step->dump();
1986 }
1987 }
1988
1989 SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
1990 const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
1991 CHECK_LT(index, mainModel->inputCount());
1992 const auto operandIndex = mainModel->getInputOperandIndex(index);
1993 return {kMainModelInSourceModels, operandIndex};
1994 }
1995
1996 SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
1997 const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
1998 CHECK_LT(index, mainModel->outputCount());
1999 const auto operandIndex = mainModel->getOutputOperandIndex(index);
2000 return {kMainModelInSourceModels, operandIndex};
2001 }
2002
2003 void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
2004 const StepRoleCallback& callback) const {
2005 callback(mPreparedModel.get(), IOType::INPUT, index);
2006 }
2007
2008 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
2009 const StepRoleCallback& callback) const {
2010 callback(mPreparedModel.get(), IOType::OUTPUT, index);
2011 }
2012
2013 // Map an input role of the main model to the input/output roles in the step models.
2014 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
2015 const StepRoleCallback& callback) const {
2016 const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
2017 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2018 }
2019
2020 // Map an output role of the main model to the input/output roles in the step models.
2021 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
2022 const StepRoleCallback& callback) const {
2023 const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index);
2024 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2025 }
2026
2027 void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand(
2028 const SourceOperandIndex& index, const StepRoleCallback& callback) const {
2029 const auto it = mSourceOperandToStepRoles.find(index);
2030 if (it == mSourceOperandToStepRoles.end()) return;
2031 for (const auto& [stepIndex, type, ioIndex] : it->second) {
2032 CHECK_LT(stepIndex, mSteps.size());
2033 const auto* step = mSteps[stepIndex]->executionStep();
2034 callback(step->getPreparedStepModel().get(), type, ioIndex);
2035 }
2036 }
2037
2038 MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const {
2039 CHECK(mState == SIMPLE || mState == COMPOUND);
2040 if (mState == SIMPLE) {
2041 return simple()->mPreparedModel->getMemoryPreference();
2042 } else {
2043 const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index)
2044 : getOutputSourceOperand(index);
2045 return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
2046 }
2047 }
2048
2049 MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand(
2050 const SourceOperandIndex& index) const {
2051 uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding;
2052 forEachStepRoleOfSourceOperand(
2053 index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) {
2054 const auto preference = preparedModel->getMemoryPreference();
2055 alignment = std::max(alignment, preference.alignment);
2056 padding = std::max(padding, preference.padding);
2057 });
2058 return {alignment, padding};
2059 }
2060
2061 void ExecutionPlan::forEachDynamicTemporary(
2062 const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
2063 fn) const {
2064 if (mState != COMPOUND) {
2065 return;
2066 }
2067
2068 for (const auto& logicalStep : compound()->mSteps) {
2069 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
2070 const uint32_t stepIndex = step->getIndex();
2071 const uint32_t sourceModelIndex = step->getSourceModelIndex();
2072 for (const auto& entry : step->getTempsAsStepModelOutputs()) {
2073 const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
2074 const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
2075 if (hasUnknownSize(sourceOperand)) {
2076 fn(sourceOperandIndex, sourceOperand, stepIndex);
2077 }
2078 }
2079 }
2080 }
2081 }
2082
2083 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
2084 uint32_t preference, uint32_t priority,
2085 const OptionalTimePoint& deadline, ExecutionPlan* plan,
2086 int simulateFailureResultCode) const {
2087 uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
2088 NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
2089 deadline, plan));
2090 int n = plan->finish(preference, priority, deadline, simulateFailureResultCode);
2091 if (VLOG_IS_ON(COMPILATION)) {
2092 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
2093 logModelToInfo(makeModel());
2094 plan->dump();
2095 }
2096 return n;
2097 }
2098
2099 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
2100 const std::vector<std::shared_ptr<Device>>& devices,
2101 uint32_t preference, uint32_t priority,
2102 const OptionalTimePoint& deadline,
2103 ExecutionPlan* plan) const {
2104 // This function uses a heuristic approach to partitioning the graph.
2105 // It should be good enough for the first release.
2106
2107 SourceModels* sourceModels = &plan->getSourceModels();
2108 const size_t deviceCount = devices.size();
2109 const size_t operationCount = mOperations.size();
2110
2111 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
2112 << "sourceModelIndex = " << sourceModelIndex << ", "
2113 << "deviceCount = " << deviceCount << ", "
2114 << "operationCount = " << operationCount;
2115
2116 // Figure out where each operation will best execute.
2117 // The value of the vector is the index in the devices vector.
2118 std::vector<int> bestDeviceForOperation(operationCount);
2119 NN_RETURN_IF_ERROR(
2120 findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
2121
2122 // A special value produced by findBestDeviceForEachOperation meaning that
2123 // this is a control flow operation scheduled for interpreted execution
2124 // (see LogicalStep).
2125 const int kControlFlowInterpreter = deviceCount;
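     // (For example, with three devices, values 0..2 of bestDeviceForOperation
     // name those devices, and value 3 selects interpreted control flow.)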
2126
2127 // If one device will run all the operations, we don't need to split the
2128 // work. This shortcut does not apply when recursively partitioning
2129 // referenced models because our plan representation is flat.
2130 if (sourceModelIndex == kMainModelInSourceModels &&
2131 std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
2132 std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
2133 const int bestDeviceIndex = bestDeviceForOperation[0];
2134 // Bypass the partitioning process unless the only operation is a
2135 // control flow operation scheduled for interpreted execution.
2136 if (bestDeviceIndex != kControlFlowInterpreter) {
2137 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
2138 << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
2139 plan->becomeSingleStep(devices[bestDeviceIndex], this);
2140 return ANEURALNETWORKS_NO_ERROR;
2141 }
2142 }
2143
2144 // No easy solution, we need to split the work.
2145
2146 // We keep track of the operations that are ready to run for each device.
2147 // perDeviceQueue[deviceCount] is for interpreted execution of control flow
2148 // (see LogicalStep).
2149 std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
2150
2151 // This helper function produces a device name.
2152 auto deviceName = [&devices, kControlFlowInterpreter,
2153 deviceCount](int deviceIndex) -> std::string {
2154 if (deviceIndex == kControlFlowInterpreter) {
2155 return "NNAPI";
2156 } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
2157 return "{unknown}";
2158 } else {
2159 return devices.at(deviceIndex)->getName();
2160 }
2161 };
2162
2163 // This helper function enqueues the operation on the appropriate queue.
2164 auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
2165 int deviceIndex = bestDeviceForOperation[operationIndex];
2166 perDeviceQueue[deviceIndex].push(operationIndex);
2167 VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
2168 << deviceIndex << " (" << deviceName(deviceIndex) << ")";
2169 };
2170
2171 // This helper function finds a device that has operations ready to process.
2172 // We start by looking at the control flow queue, and then look at the
2173 // devices in reverse order (i.e., starting at the end of the devices
2174 // vector). Earlier devices have a chance to prepare more of the inputs
2175 // required by other devices. This function returns -1 if all queues are
2176 // empty.
2177 auto findNextDeviceToProcess = [&]() -> int {
2178 for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
2179 if (!perDeviceQueue[i].empty()) {
2180 return i;
2181 }
2182 }
2183 return -1;
2184 };
2185
2186 OperandTracker tracker(this, enqueueOnAppropriateDevice);
2187 // For each iteration of this loop, we'll create either an execution step or
2188 // an interpreted control flow construct (including nested execution steps
2189 // and interpreted control flow constructs).
2190 while (true) {
2191 // Find the device we'll do this step for.
2192 int deviceIndex = findNextDeviceToProcess();
2193 VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
2194 << deviceName(deviceIndex) << ")";
2195 if (deviceIndex < 0) {
2196 break;
2197 }
2198
2199 // Assign as much as possible to this device.
2200 auto& queue = perDeviceQueue[deviceIndex];
2201 if (deviceIndex != kControlFlowInterpreter) {
2202 ExecutionStep* step =
2203 plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
2204 while (!queue.empty()) {
2205 uint32_t operationIndex = queue.front();
2206 queue.pop();
2207 int n = step->addOperation(operationIndex);
2208 if (n != ANEURALNETWORKS_NO_ERROR) {
2209 LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
2210 return n;
2211 }
2212 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
2213 }
2214 } else {
2215 while (!queue.empty()) {
2216 uint32_t operationIndex = queue.front();
2217 queue.pop();
2218 const Operation& operation = getOperation(operationIndex);
2219 if (operation.type == OperationType::IF) {
2220 namespace op = operation_if;
2221 const Operand& thenOperand =
2222 getOperand(operation.inputs[op::kThenModelOperand]);
2223 const Operand& elseOperand =
2224 getOperand(operation.inputs[op::kElseModelOperand]);
2225 const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2226 const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2227 uint32_t thenModelIndex = sourceModels->addModel(thenModel);
2228 uint32_t elseModelIndex = sourceModels->addModel(elseModel);
2229
2230 // Emits the following:
2231 // Index Step
2232 // i if then=(i + 1) else=(j + 1)
2233 // ... (then model steps)
2234 // j goto k
2235 // ... (else model steps)
2236 // k (steps after the IF)
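                         // Illustrative numbering: if this IF is step 0 and
                         // each branch partitions into a single execution
                         // step, the emitted steps are
                         //   0 if then=1 else=3
                         //   1 (then step)
                         //   2 goto 4
                         //   3 (else step)
                         //   4 (steps after the IF)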
2237 IfStep* ifStep = plan->createNewIfStep();
2238 ifStep->conditionOperandIndex = SourceOperandIndex(
2239 sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
2240 ifStep->thenStepIndex = plan->getNextStepIndex();
2241 NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
2242 thenModelIndex, devices, preference, priority, deadline, plan));
2243 GotoStep* afterThenBranch = plan->createNewGotoStep();
2244 ifStep->elseStepIndex = plan->getNextStepIndex();
2245 NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
2246 elseModelIndex, devices, preference, priority, deadline, plan));
2247 afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
2248
2249 // Outer model operands.
2250 for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
2251 ifStep->outerInputOperands.emplace_back(sourceModelIndex,
2252 operation.inputs[i]);
2253 }
2254 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
2255 ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
2256 operation.outputs[i]);
2257 }
2258 // Then model operands.
2259 for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
2260 ifStep->thenBranchInputOperands.emplace_back(
2261 thenModelIndex, thenModel->getInputOperandIndex(i));
2262 }
2263 for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
2264 ifStep->thenBranchOutputOperands.emplace_back(
2265 thenModelIndex, thenModel->getOutputOperandIndex(i));
2266 }
2267 // Else model operands.
2268 for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
2269 ifStep->elseBranchInputOperands.emplace_back(
2270 elseModelIndex, elseModel->getInputOperandIndex(i));
2271 }
2272 for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
2273 ifStep->elseBranchOutputOperands.emplace_back(
2274 elseModelIndex, elseModel->getOutputOperandIndex(i));
2275 }
2276 } else if (operation.type == OperationType::WHILE) {
2277 namespace op = operation_while;
2278 const Operand& condOperand =
2279 getOperand(operation.inputs[op::kCondModelOperand]);
2280 const Operand& bodyOperand =
2281 getOperand(operation.inputs[op::kBodyModelOperand]);
2282 const ModelBuilder* condModel = getReferencedModel(condOperand);
2283 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2284 uint32_t condModelIndex = sourceModels->addModel(condModel);
2285 uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
2286
2287 // Emits the following:
2288 // Index Step
2289 // i while cond=(i + 1) body=(j + 1) exit=(k + 1)
2290 // ... (cond model steps)
2291 // j goto i
2292 // ... (body model steps)
2293 // k goto i
2294 // ... (steps after the WHILE)
2295 //
2296 // Note that WhileStep has WhileState associated with it.
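                         // Illustrative numbering: if this WHILE is step 0,
                         // its cond model partitions into one step, and its
                         // body model partitions into two, the emitted steps
                         // are
                         //   0 while cond=1 body=3 exit=6
                         //   1 (cond step)
                         //   2 goto 0
                         //   3 (body step)
                         //   4 (body step)
                         //   5 goto 0
                         //   6 (steps after the WHILE)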
                    WhileStep* whileStep = plan->createNewWhileStep();
                    whileStep->condStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
                            condModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterCond = plan->createNewGotoStep();
                    afterCond->gotoStepIndex = whileStep->index;
                    whileStep->bodyStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
                            bodyModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterBody = plan->createNewGotoStep();
                    afterBody->gotoStepIndex = whileStep->index;
                    whileStep->exitStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        whileStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                   operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                    operation.outputs[i]);
                    }
                    // Cond model operands.
                    for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
                        whileStep->condInputOperands.emplace_back(
                                condModelIndex, condModel->getInputOperandIndex(i));
                    }
                    whileStep->condOutputOperand =
                            SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
                    // Body model operands.
                    for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
                        whileStep->bodyInputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
                        whileStep->bodyOutputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getOutputOperandIndex(i));
                    }
                } else {
                    CHECK(false) << operation.type << " is not a control flow operation";
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

float ModelBuilder::getPerformance(uint32_t preference,
                                   const std::shared_ptr<Device> device) const {
    // Note that we will call this method multiple times per compilation with
    // the same arguments if there are nested control flow operations and we
    // decide to execute the outer operation on the ExecutionPlan::next()
    // interpreter.
    //
    // This is a potential compilation performance problem. To work around it,
    // the performance value could be cached for the duration of a compilation.
    float perf = 0;
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        perf += getPerformance(preference, device, operationIndex);
    }
    return perf;
}
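
// A sketch of the caching workaround suggested above (not implemented here;
// perfCache and its key are hypothetical): keep a per-compilation cache keyed
// on the device and preference, and consult it before recursing into
// referenced models:
//   std::map<std::pair<const Device*, uint32_t>, float> perfCache;
//   // On a hit, return perfCache[{device.get(), preference}]; on a miss,
//   // compute the sum as in getPerformance() above and store it.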

float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
                                   uint32_t operationIndex) const {
    auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) {
        return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
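        // Only one branch runs per IF evaluation, so each branch's cost is
        // weighted by 0.5. This treats the two branches as equally likely,
        // presumably the best guess absent branch statistics at compile time.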
        return applyPreference(device->getIfPerformance()) +
               0.5 * (thenModel->getPerformance(preference, device) +
                      elseModel->getPerformance(preference, device));
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
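        // The iteration count is unknown at compilation time, so the cond and
        // body models are each counted exactly once; in effect this estimates
        // the cost of a single loop iteration rather than of the whole loop.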
        return applyPreference(device->getWhilePerformance()) +
               condModel->getPerformance(preference, device) +
               bodyModel->getPerformance(preference, device);
    }

    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
            }
            break;
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
            }
            break;
        default:
            break;
    }

    return applyPreference(device->getPerformance(operandType));
}
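
// Lower scores are better: findBestDeviceForEachOperation() below keeps the
// device with the smallest perfVal for each operation. For example, with two
// hypothetical devices,
//   model.getPerformance(pref, gpuDevice) < model.getPerformance(pref, cpuDevice)
// would favor gpuDevice for any operation that both devices can perform.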

bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
    auto containsUnknownSize = [](const ModelBuilder* model,
                                  const std::vector<uint32_t>& operandIndexes) {
        for (uint32_t operandIndex : operandIndexes) {
            if (hasUnknownSize(model->getOperand(operandIndex))) {
                return true;
            }
        }
        return false;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
               containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
               containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
    }

    // Not a control flow operation.
    return false;
}
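
// This predicate gates two separate decisions: the control flow interpreter
// rejects such operations because the partitioner does not support dynamic
// temporaries (b/132458982), and findBestDeviceForEachOperation() pins them
// to the CPU because they are not supported by the 1.3 HAL (b/159076604).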

bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
           // The partitioner does not support dynamic temporaries (b/132458982).
           !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
}

namespace {

// This class determines whether a given device can execute a given operation.
class CanDo {
   public:
    CanDo() {}

    void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
        mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

   private:
    std::vector<bool> mSupportsOperationByIndex;
};
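
// A minimal usage sketch for CanDo (the names here are hypothetical):
//   CanDo canDo;
//   canDo.initialize(metaModel, device);    // one driver query per device
//   bool ok = canDo.check(operationIndex);  // afterwards, a cheap table lookup
// initialize() is kept separate from the constructor, presumably so that a
// std::vector<CanDo> can be sized up front and filled in afterwards, which is
// how findBestDeviceForEachOperation() below uses it.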

}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());

    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this
        // operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            bool bestIsUpdatable = false;
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    const bool isUpdatable = device->isUpdatable();
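                    // On a performance tie, prefer the CPU device, or prefer an
                    // updatable driver when the best choice so far is not updatable.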
                    const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice() ||
                                                    (isUpdatable && !bestIsUpdatable));
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && deviceIsPreferred)) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                        bestIsUpdatable = isUpdatable;
                    }
                } else {
                    // Somewhat noisy logging, but this is the only place where a user
                    // of NNAPI can get feedback on why an operation was not run on a
                    // specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically
                    // deviceCount is very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << operation.type << ":" << operationIndex;
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << operation.type;
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
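            // deviceCount is one past the last valid index into `devices`, so
            // it acts as a sentinel meaning "no real device; use the control
            // flow interpreter". The log below reports this choice as "-1
            // (NNAPI)".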
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = -1 (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android