/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include <android/sync.h>
#include <fcntl.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ControlFlow.h"
#include "CpuExecutor.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
#include "GraphDump.h"
#include "Manager.h"
#include "MetaModel.h"
#include "ModelBuilder.h"
#include "OperationsUtils.h"
#include "TokenHasher.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

namespace android {
namespace nn {

namespace {

using namespace hal;

// The index of the main model in SourceModels.
constexpr uint32_t kMainModelInSourceModels = 0;
// Compiles the model on device.
// If compilation caching is available, then depending on ExecutionPlan::mState, the token may
// have been initialized only from the user-provided token (SIMPLE body), or may already have been
// re-hashed with the indices of the operations to be executed (COMPOUND body). This function
// re-hashes the token further with the device name, device version string, execution preference,
// and compilation priority.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const std::optional<Deadline>& deadline,
            const std::string& cacheDir, TokenHasher* token,
            std::shared_ptr<PreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
        cacheToken.emplace(token->getCacheToken());
    }

    const ModelFactory makeModel = [&model] { return model.makeHidlModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToHalPriority(compilationPriority);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken);
    *preparedModel = returnedPreparedModel;
    return n;
}
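
// Illustrative only (not part of this file's logic): a caller typically invokes compile()
// as in the sketch below, where `device`, `model`, `token`, and `cacheDir` are assumed to
// have been set up elsewhere:
//
//     std::shared_ptr<PreparedModel> preparedModel;
//     int n = compile(*device, model, ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER,
//                     ANEURALNETWORKS_PRIORITY_DEFAULT, /*deadline=*/{},
//                     cacheDir, &token, &preparedModel);
//     if (n != ANEURALNETWORKS_NO_ERROR) { /* fall back or propagate the error */ }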

typedef std::function<void(uint32_t)> OperationReadyCallback;

int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        fromOperand.extraParams.getDiscriminator() ==
                OperandExtraParams::hidl_discriminator::channelQuant) {
        auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtensionOperandType(fromOperand.type) &&
               fromOperand.extraParams.getDiscriminator() ==
                       OperandExtraParams::hidl_discriminator::extension) {
        hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (fromOperand.extraParams.getDiscriminator() !=
                       OperandExtraParams::hidl_discriminator::none ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << toString(fromOperand.type)
                   << " has an unexpected extraParams discriminator: "
                   << static_cast<int>(fromOperand.extraParams.getDiscriminator());
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Now that the
    // outputs of the operation are known, new operations may become ready to
    // run; calls cb for each of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                count++;
                mOperandToOperations.emplace(operandIndex, operationIndex);
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}
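
// Illustrative only: a partitioner is expected to drive OperandTracker with a ready
// queue, roughly as sketched below (the queue and `model` are assumptions of this
// sketch, not state owned by this file):
//
//     std::queue<uint32_t> ready;
//     OperandTracker tracker(model, [&ready](uint32_t op) { ready.push(op); });
//     while (!ready.empty()) {
//         uint32_t op = ready.front();
//         ready.pop();
//         // ... assign operation `op` to a step model ...
//         tracker.markProcessed(op, [&ready](uint32_t next) { ready.push(next); });
//     }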

}  // namespace

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case OperandLifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        default: {
            CHECK(!"unexpected");
        } break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
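
// To summarize addOperand(): constants (CONSTANT_COPY, CONSTANT_REFERENCE), NO_VALUE
// operands, and SUBGRAPH references are copied into the step model directly. A
// TEMPORARY_VARIABLE or SUBGRAPH_OUTPUT operand first seen as an input must be defined
// by a different partition and is recorded as a step model input; one first seen as an
// output is recorded so that later steps can find it.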

int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutputToInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
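
// Note on mapInputsAndOutputs(): each step model input is resolved, in order, against
// temporaries, main model inputs, main model outputs, and constant partition boundary
// operands; each step model output is resolved against temporaries and then main model
// outputs. Failing every lookup is a CHECK-failure, since by construction every
// boundary operand must fall into one of these categories.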

void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
            return;
        }
        uint32_t stepIndex = it->second;
        CHECK_LT(stepIndex, mSteps.size());
        mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
    };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& input : step->getTempsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            recordAsOutputIfTemporary(step->conditionOperandIndex);
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
}
// Records the given operand (a source model operand index that must already be known to
// this step) as a step model output.
void ExecutionStep::recordTempAsStepModelOutput(uint32_t sourceOperandIndex) {
    const auto it = mOperandMap.find(sourceOperandIndex);
    CHECK(it != mOperandMap.end());
    mTempsAsStepModelOutputs.emplace(sourceOperandIndex, it->second);
}

const ModelBuilder* ExecutionStep::getSourceModel() const {
    return mPlan->getSourceModels().getModel(mSourceModelIndex);
}

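// Logs, at COMPILATION verbosity, every operand remapping table of this step: model
// inputs/outputs, temporaries used as step model inputs/outputs, and main model outputs
// used as step model inputs.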
void ExecutionStep::logStepModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += toString(e.first);
        toLog += "->";
        toLog += toString(e.second);
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("step model inputs", mStepModelInputs);
    logRemapVector("step model outputs", mStepModelOutputs);
    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
    logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
    logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
}

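// Returns whether the operand's size cannot be determined statically: a tensor of
// unknown rank (no dimensions), or one with any dimension of extent 0. Illustrative
// examples: for a tensor type, dimensions {} and {2, 0, 3} both count as unknown,
// while {2, 3} does not; a scalar type never does.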
static bool hasUnknownSize(const Operand& operand) {
    if (operand.dimensions.size() == 0) {
        return TypeManager::get()->isTensorType(operand.type);
    }
    for (uint32_t dimension : operand.dimensions) {
        if (dimension == 0) {
            return true;
        }
    }
    return false;
}

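// Finalizes this step: merges the input/output remapping vectors, builds the
// step-model-to-main-model index maps for steps of the main model, identifies the step
// model's inputs and outputs, finishes the step model, and compiles it on this step's
// device.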
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first)
                              << " of source graph) has unknown size: " << toString(operand);
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and
        // relies on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    // TODO: Model::finish() should use ValidationMode::RUNTIME when sending the
    // step model to CpuDevice. Right now, this is harmless because the only
    // difference in validation occurs with control flow operations and inputs
    // or outputs of unknown size and we never send control flow operations to
    // CpuDevice. We need to address this if this behavior changes (b/151634976).
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(),
                   &mToken, &mPreparedStepModel);
}

void ExecutionStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
        logModelToInfo(mStepModel.makeHidlModel());
    }
}

std::string toString(const IfStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
        << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
    return oss.str();
}

std::string toString(const WhileStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": while cond=" << step.condStepIndex
        << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
    return oss.str();
}

std::string toString(const GotoStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": goto " << step.gotoStepIndex;
    return oss.str();
}
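
// For reference, the formatters above yield strings of the form
// "Step#1: if <cond operand> then=2 else=3",
// "Step#4: while cond=5 body=6 exit=8", and "Step#7: goto 4".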

void LogicalStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        if (const IfStep* step = tryIfStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else if (const WhileStep* step = tryWhileStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else if (const GotoStep* step = tryGotoStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else {
            executionStep()->dump();
        }
    }
}

int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                        int32_t executionPreference, int32_t priority,
                                        const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    CHECK(!deadline.has_value());
    const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);

    auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
        for (const auto& sourceOperandIndex : operands) {
            const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
            const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
            if (hasUnknownSize(operand)) {
                return true;
            }
        }
        return false;
    };

    findTempsAsStepModelOutputs();
    for (const auto& logicalStep : mSteps) {
        if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
            int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
                                          executionPreference, priority);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                VLOG(COMPILATION)
                        << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
                return n;
            }
        } else if (IfStep* step = logicalStep->tryIfStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            // step->conditionOperandIndex has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->thenBranchInputOperands));
            CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
            CHECK(!containsUnknownSize(step->elseBranchInputOperands));
            CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
        } else if (WhileStep* step = logicalStep->tryWhileStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            CHECK(!containsUnknownSize(step->condInputOperands));
            // step->condOutputOperand has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->bodyInputOperands));
            CHECK(!containsUnknownSize(step->bodyOutputOperands));
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    if (mHasStepModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
        mSourceOperandToInputIndex[index] = i;
    }
    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
        mSourceOperandToOutputIndex[index] = i;
    }

    findControlFlowBoundaryConstants(sourceModels);

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}

int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
                                      int32_t priority, const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    CHECK(mDevice != nullptr);
    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
                          &mToken, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const std::optional<Deadline>& deadline) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
}

ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                                      const BurstBuilder* burstBuilder)
    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
      mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // Do not attempt to copy boundary constants into a buffer that was never allocated.
        return;
    }
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex],
               location.buffer, location.length);
    }
}

// Attempt to create a burst object for each PreparedModel/Partition. If the
// burst controller object cannot be made, return a nullptr in its place to
// indicate the regular execution path should be used. This can occur either
// because PreparedModel was nullptr (cpu was best choice), or because the
// IPreparedModel was of insufficient version or failed to configure the burst.
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
        int preference) const {
    switch (mState) {
        // burst object for each partition in the compound case
        case COMPOUND: {
            std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
            bursts.reserve(compound()->mSteps.size());
            for (const auto& logicalStep : compound()->mSteps) {
                if (!logicalStep->isExecution()) {
                    bursts.push_back(nullptr);
                    continue;
                }
                if (const auto preparedModel =
                            logicalStep->executionStep()->getPreparedStepModel()) {
                    const bool preferPowerOverLatency =
                            (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
                    bursts.push_back(
                            preparedModel->configureExecutionBurst(preferPowerOverLatency));
                } else {
                    bursts.push_back(nullptr);
                }
            }
            return bursts;
        }
        // single burst object for the simple case
        case SIMPLE: {
            std::vector<std::shared_ptr<ExecutionBurstController>> burst;
            auto simpleBody = simple();
            if (const auto preparedModel = simpleBody->mPreparedModel) {
                const bool preferPowerOverLatency =
                        (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
                burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
            } else {
                burst.push_back(nullptr);
            }
            return burst;
        }
        // no burst objects made
        default:
            return {};
    }
}

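// Creates the Controller for one execution of this plan. For a SIMPLE plan this is
// trivial; for a COMPOUND plan this computes the layout of the shared Memory object
// that backs partition-boundary temporaries and control flow buffers, as described by
// the comments below.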
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    if (mState == SIMPLE) {
        return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
    }
    // Create the layout for a Memory object big enough to hold
    // - every partition boundary TEMPORARY operand and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries. Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage. We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively. This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime. Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
        totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
        const uint32_t offset = totalSizeOfTemporaries;
        totalSizeOfTemporaries += size;
        return offset;
    };
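    // Worked example (assuming alignBytesNeeded() pads offsets to a 4-byte boundary for
    // sizes of 4 bytes or more): with totalSizeOfTemporaries == 10, addTemporaryOfSize(8)
    // first adds 2 padding bytes, returns offset 12, and leaves
    // totalSizeOfTemporaries == 20.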
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
    //    operands, and panic if we see a source operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary =
            [executionBuilder, addTemporaryOfSize](
                    const SourceOperandIndex& sourceOperandIndex,
                    std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
                    OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
                CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                      lifetime == OperandLifeTime::SUBGRAPH_OUTPUT);
                const Operand& sourceOperand =
                        executionBuilder->getSourceOperand(sourceOperandIndex);
                if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE &&
                    sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                    // See the caller for explanation.
                    return;
                }
                CHECK(sourceOperand.lifetime == lifetime);
                const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
                CHECK_NE(size, 0u);
                const uint32_t offset = addTemporaryOfSize(size);
                auto [_, isNew] =
                        sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
                CHECK(isNew);
                VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                                << " offset = " << offset;
            };
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
    for (const auto& logicalStep : compound()->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToOffsetOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of a WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary,
                         OperandLifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    for (const auto& [sourceOperandIndex, location] :
         compound()->mSourceOperandToBoundaryConstantCopy) {
        const uint32_t offset = addTemporaryOfSize(location.length);
        sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << offset;
    }
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToOffsetOfTemporary),
            std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
            compound()->mSourceOperandToOutputIndex,
            compound()->mSourceOperandToBoundaryConstantCopy,
            compound()->mSourceOperandToBoundaryConstantReference));
}

// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;

    if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    controller->mNextStepIndex = controller->mFallbackNextStepIndex;
    return next(controller, executor);
}

ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(reinterpret_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}

ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}

void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}

uint32_t ExecutionPlan::Buffer::getSize() const {
    return mInfo.getSize() - mOffset;
}

void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
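
// Illustrative only: a Buffer is a thin view over a RunTimePoolInfo, so a caller might
// read and write through it roughly as follows (`buffer` here is an assumed
// std::optional<ExecutionPlan::Buffer> obtained from getBuffer() below):
//
//     memcpy(&value, buffer->getPointer(), sizeof(value));  // read
//     memcpy(buffer->getPointer(), &value, sizeof(value));  // write...
//     buffer->flush();  // ...then flush so other readers observe the write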

std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
        const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
    switch (info.state()) {
        case ModelArgumentInfo::POINTER: {
            return Buffer(info.buffer(), info.length());
        } break;
        case ModelArgumentInfo::MEMORY: {
            if (std::optional<RunTimePoolInfo> poolInfo =
                        executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
                return Buffer(*poolInfo, info.locationAndLength().offset);
            } else {
                LOG(ERROR) << "Unable to map operand memory pool";
                return std::nullopt;
            }
        } break;
        case ModelArgumentInfo::HAS_NO_VALUE: {
            LOG(ERROR) << "Attempting to read an operand that has no value";
            return std::nullopt;
        } break;
        default: {
            LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
            return std::nullopt;
        } break;
    }
}

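// Returns a Buffer for the given source operand by checking, in order, the controller's
// temporaries, main model inputs, main model outputs, and constant partition boundary
// references, or std::nullopt if the operand's memory cannot be resolved or mapped.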
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex);
        it != sourceOperandToOffsetOfTemporary.end()) {
        const uint32_t offset = it->second;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}

int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
                                      SourceOperandIndex operandIndex, bool* value) const {
    std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
    if (buffer == std::nullopt) {
        LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
        return ANEURALNETWORKS_OP_FAILED;
    }
    CHECK_GE(buffer->getSize(), sizeof(bool8));
    bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
    *value = static_cast<bool>(value8);
    VLOG(EXECUTION) << "readConditionValue: " << *value;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController,
                        int syncFdOfLastStep) const {
    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        CHECK_EQ(controller->mNextStepIndex, 0u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = simple();
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mFallbackNextStepIndex = 0;
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        CHECK_EQ(controller->mNextStepIndex, 1u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    return nextCompound(controller, executor, burstController);
}

int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    auto compoundBody = compound();
    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        controller->mNextStepIndex = Controller::kBadStepIndex;  // end
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
    if (const IfStep* step = logicalStep->tryIfStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else {
        CHECK(false) << "Unknown step variant";
        return ANEURALNETWORKS_BAD_STATE;
    }
}

int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();
    *executor =
            std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                           step->getDevice(), step->getPreparedStepModel(), step);
    step->mapInputsAndOutputs(
            *executor, controller->mTemporaries.get(),
            controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
            controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

// The first argument is the "source" operand, the second is the "destination".
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
1191 const SourceOperandIndex& innerOperand) {
1192 VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
1193 << toString(outerOperand);
1194 #ifdef NN_DEBUGGABLE
1195 CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
1196 mSourceOperandToInputIndex.count(innerOperand) +
1197 mSourceOperandToOutputIndex.count(innerOperand) +
1198 mSourceOperandToConstantReference.count(innerOperand),
1199 1u);
1200 #endif
1201 mSourceOperandToOffsetOfTemporary.erase(innerOperand);
1202 mSourceOperandToInputIndex.erase(innerOperand);
1203 mSourceOperandToOutputIndex.erase(innerOperand);
1204 mSourceOperandToConstantReference.erase(innerOperand);
1205 if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
1206 it != mSourceOperandToOffsetOfTemporary.end()) {
1207 mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
1208 } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
1209 it != mSourceOperandToInputIndex.end()) {
1210 mSourceOperandToInputIndex.emplace(innerOperand, it->second);
1211 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1212 it != mSourceOperandToOutputIndex.end()) {
1213 mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1214 } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
1215 it != mSourceOperandToConstantReference.end()) {
1216 mSourceOperandToConstantReference.emplace(innerOperand, it->second);
1217 } else {
1218 CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
1219 << " from operand " << toString(outerOperand);
1220 }
1221 }

// The first argument is the "source" operand, the second argument is the "destination".
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    mSourceOperandToOffsetOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
        it != mSourceOperandToOffsetOfTemporary.end()) {
        mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}

int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
    if (mLastStepSyncFd == -1) {
        return ANEURALNETWORKS_NO_ERROR;
    }
    VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
    auto r = syncWait(mLastStepSyncFd, -1);
    int n = ANEURALNETWORKS_NO_ERROR;
    if (r != FenceState::SIGNALED) {
        LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
        n = ANEURALNETWORKS_OP_FAILED;
    }
    return n;
}
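
// Note: the -1 timeout passed to syncWait() follows the usual sync_wait()
// convention of blocking until the fence signals or errors, so this call can
// stall indefinitely if a driver never signals the fence.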

int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: " << toString(*step);
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    return nextCompound(controller, executor, burstController);
}

int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            state.startTime = std::chrono::steady_clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = std::chrono::steady_clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand],
                          controller->mSourceOperandToOffsetOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController);
}
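
// A worked trace of the double buffering above, assuming a WHILE whose body
// runs twice (tmp1/tmp2 name the two temporary buffer sets from the iteration
// tables in the comments):
//   iteration 0: cond reads the outer inputs; the body reads them too and
//                writes tmp1.
//   iteration 1: cond reads tmp1; the swap then routes tmp1 to the body's
//                inputs (via condInputOperands) and tmp2 to its outputs.
//   exit:        cond reads tmp2 and produces false; condInputOperands still
//                points at tmp2, so the exit path copies tmp2 into the outer
//                outputs.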

int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: " << toString(*step);
    controller->mNextStepIndex = step->gotoStepIndex;
    return nextCompound(controller, executor, burstController);
}

void ExecutionPlan::becomeCompoundIfEmpty() {
    CHECK(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
}

ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
                                                     const std::shared_ptr<Device> device) {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
                                              compound()->mSteps.size(), sourceModelIndex, device);
    compound()->mSteps.push_back(step);
    return step->executionStep();
}

IfStep* ExecutionPlan::createNewIfStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
    step->ifStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->ifStep();
}

WhileStep* ExecutionPlan::createNewWhileStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
    step->whileStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->whileStep();
}

GotoStep* ExecutionPlan::createNewGotoStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
    step->gotoStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->gotoStep();
}

void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    CHECK(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheDir, mToken);
    mState = SIMPLE;
}

void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
    auto [it, isNew] =
            compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
    CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

void ExecutionPlan::reset() {
    if (mBody) {
        delete mBody;
        mBody = nullptr;
    }
    mState = EMPTY;
}

bool ExecutionPlan::isSimpleCpu() const {
    return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    return simple()->mDevice;
}

const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
    return mBody->hasStepModelOutputsOfUnknownSize();
}

const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    return simple()->mToken.getCacheToken();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}

void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
                                                        const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::OUTPUT, index);
}

// Map an input role of the main model to the input/output roles in the step models:
// - An input role of the main model may be used as an input of multiple step models.
// - An input role of the main model should not be used as an output of any step model.
void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
                                                         const StepRoleCallback& callback) const {
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Model input as step model input.
            const auto& inputMapping = step->getInputIndexStepModelToMainModel();
            for (uint32_t i = 0; i < inputMapping.size(); i++) {
                if (inputMapping[i] == index) {
                    callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
                }
            }
        }
    }
}

// Map an output role of the main model to the input/output roles in the step models:
// - An output role of the main model may only be used as one output of one single step model.
// - An output role of the main model may be used as an input of multiple step models.
void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
                                                          const StepRoleCallback& callback) const {
    bool found = false;
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Model output as step model output.
            if (!found) {
                const auto& outputMapping = step->getOutputIndexStepModelToMainModel();
                for (uint32_t i = 0; i < outputMapping.size(); i++) {
                    if (outputMapping[i] == index) {
                        callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i);
                        found = true;
                        break;
                    }
                }
            }
            // Model output as step model input.
            const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel();
            for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) {
                if (inputToOutputMapping[i] == index) {
                    callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
                }
            }
        }
    }
}

int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, uint32_t priority,
                                   const std::optional<Deadline>& deadline,
                                   ExecutionPlan* plan) const {
    uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
    NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
                                                deadline, plan));
    int n = plan->finish(preference, priority, deadline);
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
        logModelToInfo(makeHidlModel());
        plan->dump();
    }
    return n;
}
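
// A hedged sketch of the expected call site (the real caller is
// CompilationBuilder; the variable names here are illustrative):
//
//     ExecutionPlan plan;
//     int n = model->partitionTheWork(devices, preference, priority, deadline, &plan);
//     if (n == ANEURALNETWORKS_NO_ERROR) {
//         // plan is now SIMPLE (one device runs the whole model) or COMPOUND
//         // (a flat list of execution and control flow steps).
//     }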

int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
                                           const std::vector<std::shared_ptr<Device>>& devices,
                                           uint32_t preference, uint32_t priority,
                                           const std::optional<Deadline>& deadline,
                                           ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    SourceModels* sourceModels = &plan->getSourceModels();
    const size_t deviceCount = devices.size();
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
                      << "sourceModelIndex = " << sourceModelIndex << ", "
                      << "deviceCount = " << deviceCount << ", "
                      << "operationCount = " << operationCount;

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector.
    std::vector<int> bestDeviceForOperation(operationCount);
    NN_RETURN_IF_ERROR(
            findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));

    // A special value produced by findBestDeviceForEachOperation meaning that
    // this is a control flow operation scheduled for interpreted execution
    // (see LogicalStep).
    const int kControlFlowInterpreter = deviceCount;

    // If one device will run all the operations, we don't need to split the
    // work. This shortcut does not apply when recursively partitioning
    // referenced models because our plan representation is flat.
    if (sourceModelIndex == kMainModelInSourceModels &&
        std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        // Bypass the partitioning process unless the only operation is a
        // control flow operation scheduled for interpreted execution.
        if (bestDeviceIndex != kControlFlowInterpreter) {
            VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                              << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
            plan->becomeSingleStep(devices[bestDeviceIndex], this);
            return ANEURALNETWORKS_NO_ERROR;
        }
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    // perDeviceQueue[deviceCount] is for interpreted execution of control flow
    // (see LogicalStep).
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the control flow queue, and then look at the
    // devices in reverse order (i.e., starting at the end of the devices
    // vector). Earlier devices have a chance to prepare more of the inputs
    // required by other devices. This function returns -1 if all queues are
    // empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };
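
    // OperandTracker (defined earlier in this file) walks the graph in
    // topological order: the callback fires for an operation only once all of
    // its inputs are known to be ready, so every queue entry is executable at
    // the time it is dequeued.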
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }

        // Assign as much as possible to this device.
        auto& queue = perDeviceQueue[deviceIndex];
        if (deviceIndex != kControlFlowInterpreter) {
            ExecutionStep* step =
                    plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                int n = step->addOperation(operationIndex);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                    return n;
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        } else {
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                const Operation& operation = getOperation(operationIndex);
                if (operation.type == OperationType::IF) {
                    namespace op = operation_if;
                    const Operand& thenOperand =
                            getOperand(operation.inputs[op::kThenModelOperand]);
                    const Operand& elseOperand =
                            getOperand(operation.inputs[op::kElseModelOperand]);
                    const ModelBuilder* thenModel = getReferencedModel(thenOperand);
                    const ModelBuilder* elseModel = getReferencedModel(elseOperand);
                    uint32_t thenModelIndex = sourceModels->addModel(thenModel);
                    uint32_t elseModelIndex = sourceModels->addModel(elseModel);

                    // Emits the following:
                    // Index  Step
                    //  i     if then=(i + 1) else=(j + 1)
                    //  ...   (then model steps)
                    //  j     goto k
                    //  ...   (else model steps)
                    //  k     (steps after the IF)
                    IfStep* ifStep = plan->createNewIfStep();
                    ifStep->conditionOperandIndex = SourceOperandIndex(
                            sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
                    ifStep->thenStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
                            thenModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterThenBranch = plan->createNewGotoStep();
                    ifStep->elseStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
                            elseModelIndex, devices, preference, priority, deadline, plan));
                    afterThenBranch->gotoStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        ifStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                 operation.outputs[i]);
                    }
                    // Then model operands.
                    for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
                        ifStep->thenBranchInputOperands.emplace_back(
                                thenModelIndex, thenModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
                        ifStep->thenBranchOutputOperands.emplace_back(
                                thenModelIndex, thenModel->getOutputOperandIndex(i));
                    }
                    // Else model operands.
                    for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
                        ifStep->elseBranchInputOperands.emplace_back(
                                elseModelIndex, elseModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
                        ifStep->elseBranchOutputOperands.emplace_back(
                                elseModelIndex, elseModel->getOutputOperandIndex(i));
                    }
                } else if (operation.type == OperationType::WHILE) {
                    namespace op = operation_while;
                    const Operand& condOperand =
                            getOperand(operation.inputs[op::kCondModelOperand]);
                    const Operand& bodyOperand =
                            getOperand(operation.inputs[op::kBodyModelOperand]);
                    const ModelBuilder* condModel = getReferencedModel(condOperand);
                    const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
                    uint32_t condModelIndex = sourceModels->addModel(condModel);
                    uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);

                    // Emits the following:
                    // Index  Step
                    //  i     while cond=(i + 1) body=(j + 1) exit=(k + 1)
                    //  ...   (cond model steps)
                    //  j     goto i
                    //  ...   (body model steps)
                    //  k     goto i
                    //  ...   (steps after the WHILE)
                    //
                    // Note that WhileStep has WhileState associated with it.
                    WhileStep* whileStep = plan->createNewWhileStep();
                    whileStep->condStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
                            condModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterCond = plan->createNewGotoStep();
                    afterCond->gotoStepIndex = whileStep->index;
                    whileStep->bodyStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
                            bodyModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterBody = plan->createNewGotoStep();
                    afterBody->gotoStepIndex = whileStep->index;
                    whileStep->exitStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        whileStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                   operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                    operation.outputs[i]);
                    }
                    // Cond model operands.
                    for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
                        whileStep->condInputOperands.emplace_back(
                                condModelIndex, condModel->getInputOperandIndex(i));
                    }
                    whileStep->condOutputOperand =
                            SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
                    // Body model operands.
                    for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
                        whileStep->bodyInputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
                        whileStep->bodyOutputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getOutputOperandIndex(i));
                    }
                } else {
                    CHECK(false) << toString(operation.type) << " is not a control flow operation";
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

float ModelBuilder::getPerformance(uint32_t preference,
                                   const std::shared_ptr<Device> device) const {
    // Note that we will call this method multiple times per compilation with
    // the same arguments if there are nested control flow operations and we
    // decide to execute the outer operation on the ExecutionPlan::next()
    // interpreter.
    //
    // This is a potential compilation performance problem. To work around it,
    // the performance value could be cached for the duration of a compilation.
    float perf = 0;
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        perf += getPerformance(preference, device, operationIndex);
    }
    return perf;
}

float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
                                   uint32_t operationIndex) const {
    auto applyPreference = [preference](const PerformanceInfo& perf) {
        return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
        return applyPreference(device->getIfPerformance()) +
               0.5 * (thenModel->getPerformance(preference, device) +
                      elseModel->getPerformance(preference, device));
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
        return applyPreference(device->getWhilePerformance()) +
               condModel->getPerformance(preference, device) +
               bodyModel->getPerformance(preference, device);
    }

    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
            }
            break;
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
            }
            break;
        default:
            break;
    }

    return applyPreference(device->getPerformance(operandType));
}
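
// Illustrative arithmetic for the IF estimate above: with an if-overhead of
// 0.5 and then/else models scoring 4.0 and 2.0 on some device, the result is
// 0.5 + 0.5 * (4.0 + 2.0) = 3.5. The branch costs are averaged because only
// one branch runs per execution, whereas the WHILE estimate adds the condition
// and body costs once each, regardless of the eventual trip count.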

bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
    auto containsUnknownSize = [](const ModelBuilder* model,
                                  const std::vector<uint32_t>& operandIndexes) {
        for (uint32_t operandIndex : operandIndexes) {
            if (hasUnknownSize(model->getOperand(operandIndex))) {
                return true;
            }
        }
        return false;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
               containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
               containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
    }

    // Not a control flow operation.
    return false;
}

bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
           // The partitioner does not support dynamic temporaries (b/132458982).
           !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
}

namespace {

// This class determines whether a given device can execute a given operation.
class CanDo {
   public:
    CanDo() {}

    void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
        mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

   private:
    std::vector<bool> mSupportsOperationByIndex;
};

}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing());

    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << toString(operation.type);
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << toString(operation.type);
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = -1"
                              << " (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
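
// Note on the selection rule above: lower perfVal is treated as better, and an
// exact tie with the CPU device resolves in the CPU's favor; among non-CPU
// devices, the one appearing earlier in the devices vector wins a tie because
// only strictly better perfVal replaces the current choice.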

}  // namespace nn
}  // namespace android