/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "TypeManager.h"

namespace android {
namespace nn {

// Partial validation of output shapes returned from driver, to ensure they
// conform to a very specific set of rules.
static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    // Enforces the following rules (some of which are from b/154054474):
    // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
    //   If the vector is not empty, it must have as many entries as the step model has outputs.
    // - If NONE, then either shapes vector is empty, or every shape is
    //   marked isSufficient and, if a tensor, has known rank.
    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
    //   is marked !isSufficient.
    switch (executionStatus) {
        case ErrorStatus::NONE: {
            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty or of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape is unexpectedly marked !isSufficient";

            const TypeManager* tm = TypeManager::get();
            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
                 ++outputIndex) {
                const Operand& outputOperand = model->getOutputOperand(outputIndex);
                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
                             (shapes[outputIndex].dimensions.size() != 0))
                        << "With execution ErrorStatus " << executionStatus << " output#"
                        << outputIndex << " shape unexpectedly has zero rank";
            }

            break;
        }
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
            NN_RET_CHECK(shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape must have been marked !isSufficient";
            break;
        }
        default: {
            NN_RET_CHECK(shapes.size() == 0)
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty but has length " << shapes.size();
            break;
        }
    }
    return true;
}

static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
                                          model, shapes);
}

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtension(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = "
                       << operand.dimensions.size() << ", new = " << newType->dimensionCount << ")";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}
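
// Editor's note (illustrative, derived from checkDimensionInfo above): a caller
// may fill in dimensions the model left unspecified (0), but may not contradict
// fully specified ones. For a model operand with dimensions {0, 4}:
//   - a newType with dimensions {3, 4} is accepted (fills in the wildcard);
//   - a newType with dimensions {3, 5} is rejected (4 is fully specified);
//   - a newType with dimensionCount != 2 is rejected (rank mismatch).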

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isSimple());
}

CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isCompound());
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}
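
// Illustrative client-side usage of the call above (editor's sketch; the buffer
// name and size below are hypothetical, not taken from this file):
//   float input0[kInput0Size];
//   // Pass nullptr for the type to keep the operand type from the model. The
//   // call fails with ANEURALNETWORKS_BAD_STATE if computation has started or
//   // if input 0 has already been provided.
//   int n = ANeuralNetworksExecution_setInput(execution, /*index=*/0,
//                                             /*type=*/nullptr, input0,
//                                             sizeof(input0));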

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const RuntimeMemory* memory, size_t offset,
                                         size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getInputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const RuntimeMemory* memory, size_t offset,
                                          size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getOutputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        auto result = mFencedExecutionCallback();
        if (!result.has_value()) {
            LOG(ERROR) << "Fenced execution callback failed: " << result.error().message;
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
        std::tie(timingLaunched, timingFenced) = std::move(result).value();
    }
    const OptionalDuration selectedDuration = [durationCode, &timingLaunched,
                                               &timingFenced]() -> OptionalDuration {
        switch (durationCode) {
            case ANEURALNETWORKS_DURATION_ON_HARDWARE:
                return timingLaunched.timeOnDevice;
            case ANEURALNETWORKS_DURATION_IN_DRIVER:
                return timingLaunched.timeInDriver;
            case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
                return timingFenced.timeOnDevice;
            case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
                return timingFenced.timeInDriver;
            default:
                LOG(FATAL) << "unexpected duration code " << durationCode;
                return std::nullopt;
        }
    }();
    if (selectedDuration.has_value()) {
        constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1;
        using CommonType = std::common_type_t<Duration::rep, uint64_t>;
        const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming);
        *duration = static_cast<uint64_t>(count);
    } else {
        constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max();
        *duration = kNoTiming;
    }

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}
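
// Illustrative timing query sequence (editor's sketch): measurement must be
// enabled before the computation starts, and durations may be queried only
// after it completes without error:
//   ANeuralNetworksExecution_setMeasureTiming(execution, true);
//   ANeuralNetworksExecution_compute(execution);
//   uint64_t durationNs;
//   ANeuralNetworksExecution_getDuration(
//           execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &durationNs);
// A value of UINT64_MAX reports that the duration is not available.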

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::enableInputAndOutputPadding(bool enable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (mHasCalledSetInputOutput) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input "
                      "or output is set.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mInputAndOutputPaddingEnabled = enable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setReusable(bool reusable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mReusable = reusable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions cannot query "
                      "the dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    std::lock_guard<std::mutex> lock(mStateMutex);
    if (!mReusable && mState == State::COMPLETED) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on a non-reusable execution that has already completed";
        return false;
    }
    if (mState == State::COMPUTATION) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on an execution that has already started";
        return false;
    }
    mState = State::COMPUTATION;
    return true;
}
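
// Editor's note on the lifecycle enforced above: an execution moves from
// PREPARATION to COMPUTATION here, and from COMPUTATION to COMPLETED in
// finishComputation(). Only an execution created reusable (setReusable(true))
// may start another computation after reaching COMPLETED; starting a second
// computation while one is in flight is always rejected.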

// TODO(b/132321855): validate that we have full types for all inputs and outputs,
// that the graph is not cyclic,
static int validateRequest(const std::vector<ModelArgumentInfo>& inputs,
                           const std::vector<ModelArgumentInfo>& outputs) {
    for (auto& p : inputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : outputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getValidationResultCode() {
    if (!mValidationResultCode.has_value()) {
        mValidationResultCode = validateRequest(mInputs, mOutputs);
    }
    return mValidationResultCode.value();
}

bool ExecutionBuilder::areOutputsFullySpecified() {
    if (!mOutputsFullySpecified.has_value()) {
        mOutputsFullySpecified = true;
        for (uint32_t i = 0; i < mOutputs.size(); i++) {
            if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
                TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) &&
                tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type,
                                               mOutputs[i].initialDimensions())) {
                mOutputsFullySpecified = false;
                break;
            }
        }
    }
    return mOutputsFullySpecified.value();
}

int ExecutionBuilder::prepareForCompute(const char* name) {
    if (!checkAndSetComputationState(name)) {
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) {
        return finishComputation(n, {});
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//       with an explicit device list; and we do not support CPU fallback
//       with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr,
                          /*reusable=*/false);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//       with an explicit device list; and we do not support CPU fallback
//       with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, {}, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
    auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {n, std::move(outputShapes), timing};
    }

    // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
    if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
        return {n, std::move(outputShapes), {}};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, {}, {}};
    }

    // If CPU execution was already attempted, do not perform CPU fallback.
    if (mExecutor->isCpu()) {
        return {n, {}, {}};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the execution. Do an execution fallback on the CPU.
    return cpuFallbackFull(this);
}

std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";

    auto controller = mPlan->makeController(this, burstBuilder);
    std::vector<OutputShape> outputShapes = getInitialOutputShapes();

    // On this iteration, do I need to repeat the previous step because it
    // reported insufficient size?
    bool doInsufficientSizeFallback = false;

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        SharedBurst burstController;
        int n = doInsufficientSizeFallback
                        ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
                        : mPlan->next(controller, &executor, &burstController, &outputShapes);
        doInsufficientSizeFallback = false;
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            return {n, {}, {}};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);

        // Update global outputs and dynamic temporaries.
        StepExecutor::UpdateOutputShapes updateOutputShapes = {};
        if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
                                          &updateOutputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            if (updateOutputShapes.zeroSizedInput) {
                // We'll need to do full model CPU fallback
                VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
                stepN = ANEURALNETWORKS_OP_FAILED;
            } else {
                CHECK(executor->areDynamicTemporariesAllocated());
                continue;
            }
        }

        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
            if (updateOutputShapes.mainOutputInsufficient ||
                !updateOutputShapes.updatedDynamicTemporary) {
                // Either:
                // - At least one main model output is not of sufficient size; or
                // - we didn't learn anything new about dynamic temporaries.
                // Neither of these is recoverable, so end execution.
                return {stepN, outputShapes, {}};
            }
            // Every main model output is of sufficient size. This implies that
            // at least one dynamic temporary is not of sufficient size. This
            // is recoverable.
            doInsufficientSizeFallback = true;
            continue;
        }

        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, {}, {}};
        }

        // If CPU execution was already attempted, perform a full CPU fallback.
        if (executorIsCpu) {
            break;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(mAllowCpuFallback);
        if (updateOutputShapes.zeroSizedInput) {
            // Do not attempt a partial fallback.
            break;
        }
        while (true) {
            auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
                    cpuFallbackPartial(*mPlan, controller);

            // Update global outputs and dynamic temporaries.
            StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
            if (fallbackExecutor != nullptr &&
                !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
                                                      &outputShapes, &fallbackUpdateOutputShapes)) {
                fallbackN = ANEURALNETWORKS_OP_FAILED;
            }

            // If execution was successful, continue to next step.
            if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
                if (fallbackUpdateOutputShapes.zeroSizedInput) {
                    // We'll need to do full model CPU fallback
                    VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
                    fallbackN = ANEURALNETWORKS_OP_FAILED;
                    break;
                }
                CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
                goto nextStep;
            }

            if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
                VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
                                << toString(fallbackUpdateOutputShapes);
                if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
                    !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
                    // Either:
                    // - At least one main model output is not of sufficient size; or
                    // - we didn't learn anything new about dynamic temporaries.
                    // Neither of these is recoverable, so end execution.
                    return {fallbackN, outputShapes, {}};
                }
                // Every main model output is of sufficient size. This implies
                // that at least one dynamic temporary is not of sufficient
                // size. This is recoverable.
                continue;
            }

            // If the code reaches this point, then there was an error with the
            // fallback. In this case, attempt full fallback.
            break;
        }

        // If the code reaches this point, then there was an error with the
        // fallback. In this case, attempt full fallback.
        break;

    nextStep:
        // Bottom of the outer loop
        continue;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    return cpuFallbackFull(this);
}

static bool waitForSyncFences(const std::vector<int>& waitFor) {
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return false;
            }
        }
    }
    return true;
}
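
// Editor's note: in waitForSyncFences above, syncWait(fd, -1) blocks with no
// timeout until the fence signals; descriptors <= 0 are treated as "no fence"
// and skipped. Callers use this to make sure all dependencies have signaled
// before attempting a CPU fallback.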

std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto [n, syncFd, callback] =
            mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, -1, nullptr};
    }

    // If CPU execution was already attempted, return from the function with an error.
    if (mExecutor->isCpu()) {
        return {n, -1, nullptr};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
    reportTimingWithoutFencedExecutionCallback(fallbackTiming);
    return {fallbackN, -1, nullptr};
}

// In the case of partitioned execution, the computeFencedInternal call will return
// the sync fence and the fenced compute callback produced by the last partition.
// Any failed partition will result in the whole execution falling back to CPU if
// mAllowCpuFallback is set to true.
std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";

    // We should have detected this earlier in the call chain and fallen back to
    // non-fenced execution. This is an implementation limitation: In order to
    // support dynamic temporaries in this code, we'd need to implement
    // something like the following:
    // - If a partition has outputs of unknown size, compute that partition in a
    //   non fenced fashion, just as if it were scheduled on a driver that does
    //   not support fenced execution.
    // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
    //   that handles a step execution that fails with
    //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
    CHECK(!mCompilation->hasDynamicTemporaries());

    // Initialize waitForFds and syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    base::unique_fd syncFence;
    ExecuteFencedInfoCallback executeFencedInfoCallback;

    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return {n, -1, nullptr};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback};
        }

        // Attempt to compute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence.reset(syncFd);
        executeFencedInfoCallback = callback;
        waitForFds.clear();
        if (syncFd >= 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, -1, nullptr};
        }

        // If the code reaches this point, then there was an error with the
        // step execution. In this case, attempt a full CPU fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
    return {fullN, -1, nullptr};
}

int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    NN_RETURN_IF_ERROR(prepareForCompute("startComputeWithDependencies"));
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return finishComputation(ANEURALNETWORKS_BAD_DATA, {});
        }
    }
    if (!areOutputsFullySpecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " not all outputs have fully specified dimensions";
        return finishComputation(ANEURALNETWORKS_BAD_DATA, {});
    }

    // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
    // fenced executions do not support dynamic output shape.

    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    const auto deadline = makeDeadline(mTimeoutDuration);
    std::tie(result, *syncFence, mFencedExecutionCallback) =
            computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
    // If there is an error, call finishComputation to mark the computation as completed.
    // Otherwise, we will call finishComputation in SyncFenceEvent::wait().
    if (result != ANEURALNETWORKS_NO_ERROR) {
        // TODO(miaowang): support dynamic output shape only with memory domain.
        // For now just return empty output shapes.
        result = finishComputation(result, {});
    }
    return result;
}

int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                              BurstBuilder* burstBuilder) {
    CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
            << "synchronizationCallback and burstBuilder cannot simultaneously be used";

    const bool synchronous = (synchronizationCallback == nullptr);
    if (!synchronous) {
        *synchronizationCallback = nullptr;
    }

    const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
    NN_RETURN_IF_ERROR(prepareForCompute(name));

    // Validate input memory dimensions. We need to do the validation in every computation because
    // the memory dimensions may change between computations.
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::MEMORY) {
            const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex];
            if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
                return finishComputation(ANEURALNETWORKS_OP_FAILED, {});
            }
        }
    }

    // Reset output dimensions.
    if (!areOutputsFullySpecified()) {
        for (auto& output : mOutputs) {
            output.reset();
        }
    }

    const auto deadline = makeDeadline(mTimeoutDuration);
    if (synchronous) {
        if (burstBuilder) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
        }
        const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
        if (mMeasureTiming) {
            mTimingWithoutFencedExecutionCallback = timing;
        }
        return finishComputation(n, outputShapes);
    } else /* asynchronous */ {
        // TODO: For asynchronous execution, entire plan-based-path should run in an
        // asynchronous thread -- take the asynchronous thread logic out of
        // CpuExecution::compute() and use it to wrap the plan-based-path.

        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.

        // Prepare the callback for asynchronous execution.
        // std::shared_ptr<ExecutionCallback> object is returned when the
        // execution has been successfully launched, otherwise a
        // nullptr is returned. The executionCallback is
        // abstracted in the NN API as an "event".
        auto executionCallback = std::make_shared<ExecutionCallback>();
        executionCallback->setOnFinish(
                [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
                    return finishComputation(error, outputShapes);
                });
        const auto asyncStartCompute = [this, deadline, executionCallback] {
            const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
            const auto status = convertResultCodeToErrorStatus(n);
            executionCallback->notify(status, outputShapes, timing);
        };
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartCompute();
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(asyncStartCompute);
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       std::vector<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Check if the dimensions "to" can be updated to the dimensions "from", where
// "from" must be at least as fully specified as "to".
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}
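
// Editor's examples of the partial ordering that isUpdatable defines:
//   isUpdatable({},     {3, 4}) == true   // unknown rank may become known
//   isUpdatable({0, 4}, {3, 4}) == true   // 0 acts as a per-axis wildcard
//   isUpdatable({3, 4}, {3, 4}) == true   // identical dimensions are fine
//   isUpdatable({3, 4}, {5, 4}) == false  // a specified axis cannot change
//   isUpdatable({3, 4}, {3})    == false  // known rank cannot change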

static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
    return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
           outputShape.dimensions.size() &&
           (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
            outputShape.dimensions.end());
}

bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
                                          const std::vector<OutputShape>& outputShapes) {
    NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));

    if (outputShapes.size() == 0) {
        return true;
    }
    NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        // Check if only unspecified dimensions or rank are overwritten.
        NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
        const OperandType operandType = mModel->getOutputOperand(i).type;
        NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
                                                                    outputShapes[i].dimensions));
    }
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        mOutputs[i].dimensions() = outputShapes[i].dimensions;
        mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
    }
    return true;
}

bool ExecutionBuilder::updateMemories() {
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
    }
    return true;
}

int ExecutionBuilder::finishComputation(int result, const std::vector<OutputShape>& outputShapes) {
    const auto status = convertResultCodeToErrorStatus(result);
    if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
        result = ANEURALNETWORKS_OP_FAILED;
    }
    bool success = result == ANEURALNETWORKS_NO_ERROR;
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        memory->getValidator().setInitialized(success);
    }
    switch (result) {
        case ANEURALNETWORKS_NO_ERROR:
            mCompletion = Completion::NO_ERROR;
            break;
        case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
            mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE;
            break;
        default:
            mCompletion = Completion::OTHER_ERROR;
            break;
    }
    {
        std::lock_guard<std::mutex> lock(mStateMutex);
        CHECK(mState != State::PREPARATION)
                << "ExecutionBuilder::finishComputation is called in the preparation state";
        CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice";
        mState = State::COMPLETED;
    }
    return result;
}

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
    return "{ .updatedDynamicTemporary = " +
           std::to_string(updateOutputShapes.updatedDynamicTemporary) +
           ", .mainOutputInsufficient = " +
           std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
}

bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                                      std::vector<OutputShape>* to, UpdateOutputShapes* update) {
    CHECK(update != nullptr);
    *update = {.updatedDynamicTemporary = false,
               .mainOutputInsufficient = false,
               .zeroSizedInput = false};

    NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));

    if (from.size() == 0) {
        return true;
    }

    if (VLOG_IS_ON(EXECUTION)) {
        for (const auto& shape : from) {
            VLOG(EXECUTION) << "updateOutputShapes: " << shape;
        }
    }

    if (mExecutionStep != nullptr) {
        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
        NN_RET_CHECK_LE(indexMapping.size(), from.size());
        for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
            const uint32_t toIndex = indexMapping[i];
            NN_RET_CHECK_GT(to->size(), toIndex);
            NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
            (*to)[toIndex] = from[i];
            update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
            if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
                isZeroSizedTensor(executionResultCode, from[i])) {
                update->zeroSizedInput = true;
            }
        }

        if (!mDynamicTemporaries->empty()) {
            // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
            std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
            for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
                operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
            }

            const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
            for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
                const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
                const auto it =
                        operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
                if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
                    continue;
                }
                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
                VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
                                << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
                                << sourceOperandIndex.second << ") is a dynamic temporary";
                // This is a temporary, but it might not be a dynamic temporary.
                const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
                if (loc == std::nullopt) {
                    continue;
                }
                NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
                bool changedShape = false;
                const uint32_t actualSize = TypeManager::get()->getSizeOfData(
                        mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
                if (actualSize > 0) {
                    changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
                                                                  from[i].dimensions, actualSize);
                } else if (!from[i].isSufficient) {
                    NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2)
                            << "output#" << i << " paddedLength overflow";
                    changedShape = mDynamicTemporaries->redeclare(
                            sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength);
                } else {
                    // The combination of not-fully-specified dimensions
                    // and isSufficient means that we have no
                    // information about whether the size of the dynamic
                    // temporary is adequate.
                    VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
                    if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
                        NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
                        // This is a zero-sized tensor, and by
                        // definition, any dynamic temporary is an input
                        // to an execution step.
                        update->zeroSizedInput = true;
                    }
                }
                if (changedShape) {
                    // TODO: find a better place for this comment.
                    //
                    // isUpdatable(a, b) imposes a partial ordering a <=
                    // b. Every fully specified dimensions vector is an
                    // upper bound of that ordering. Therefore, any
                    // change in dimensions moves towards an upper
                    // bound, and hence there are a finite number of
                    // such changes possible.
                    //
                    // actualSize can only be computed from dimensions
                    // that are an upper bound. Therefore, once
                    // actualSize is computed, it will not change.
                    //
                    // If dimensions are not fully specified, and
                    // estimated size changes, it increases. There is
                    // an upper bound on estimated size to avoid
                    // overflow.
                    //
                    // Therefore, if we retry only when dimensions or
                    // size change, and we stop retrying if we would
1296 // otherwise overflow, we should only retry a finite
1297 // number of times.
1298 update->updatedDynamicTemporary = true;
1299 }
1300 }
1301 mDynamicTemporaries->vlogDump("finished updateOutputShapes");
1302 }
1303 } else {
1304 NN_RET_CHECK_EQ(from.size(), to->size());
1305 for (uint32_t i = 0, e = from.size(); i < e; i++) {
1306 NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
1307 (*to)[i] = from[i];
1308 }
1309 }
1310 return true;
1311 }
1312
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                           std::shared_ptr<Device> device,
                           std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                           const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries)
    : mExecutionBuilder(executionBuilder),
      mExecutionStep(step),
      mDynamicTemporaries(dynamicTemporaries),
      mModel(model),
      mDevice(device),
      mPreparedModel(preparedModel),
      mInputs(model->inputCount()),
      mOutputs(model->outputCount()),
      mReusable(reusable) {
    CHECK(mDevice != nullptr);
    CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
    CHECK(!(reusable && dynamicTemporaries != nullptr));
    VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                    << mOutputs.size() << " outputs";
}

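// Returns true if this executor has no dynamic temporaries to worry about, or if all of its
// step's dynamic temporaries have backing memory allocated.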
bool StepExecutor::areDynamicTemporariesAllocated() const {
    return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
}

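// Used when this executor covers the whole model (no partitioning step): the executor's
// inputs, outputs, and memory pools are exactly those of the ExecutionBuilder.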
void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

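// Copies an argument descriptor from the ExecutionBuilder to this executor, optionally
// overriding its dimensions, and remaps MEMORY arguments from the builder's memory pool index
// space to this executor's.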
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput,
                                    const Dimensions* builderDimensions) {
    auto updateDimensions = [executorInputOrOutput, builderDimensions] {
        if (!builderDimensions) {
            return;
        }
        executorInputOrOutput->dimensions() = *builderDimensions;
    };

    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::POINTER:
            updateDimensions();
            break;
        case ModelArgumentInfo::MEMORY: {
            updateDimensions();
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}

int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const RuntimeMemory* memory, uint32_t offset,
                                             uint32_t length, const Dimensions& dimensions,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
        CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
        inputOrOutputInfo->dimensions() = dimensions;
    }
    return n;
}

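// Formats a dimensions vector for logging, e.g. {2, 3, 4} becomes "(2, 3, 4)" and an empty
// vector becomes "()".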
static std::string toString(std::vector<uint32_t> dimensions) {
    std::string ret = "(";
    bool wroteOne = false;
    for (uint32_t dimension : dimensions) {
        if (wroteOne) {
            ret += ", ";
        } else {
            wroteOne = true;
        }
        ret += std::to_string(dimension);
    }
    ret += ")";
    return ret;
}

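// Logs the state and location of each input or output argument; "kind" is a label such as
// "input" or "output". Only useful when VLOG(EXECUTION) is enabled.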
static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

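// Lazily creates and caches the RuntimeExecution for this executor. Only valid when the
// executor was constructed with reusable == true; subsequent calls return the cached execution.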
std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() {
    CHECK(mReusable);
    if (mExecution == nullptr) {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        auto [n, execution] = mPreparedModel->createReusableExecution(
                mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return {n, nullptr};
        }
        mExecution = std::move(execution);
    }
    return {ANEURALNETWORKS_NO_ERROR, mExecution};
}

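// Runs the step synchronously, either through the cached reusable execution or through a
// one-shot RuntimePreparedModel::execute() call, and reports timing back to the
// ExecutionBuilder.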
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const OptionalTimePoint& deadline, const SharedBurst& burstController) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    int n;
    std::vector<OutputShape> outputShapes;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, {}, {}};
        }
        std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, outputShapes, timing) =
                mPreparedModel->execute(mInputs, mOutputs, mMemories.getObjects(), burstController,
                                        measure, deadline, loopTimeoutDuration);
    }
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    return {n, std::move(outputShapes), std::move(timing)};
}

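// Runs the step as a fenced execution: waits on the sync fences in waitFor, and returns a sync
// fence fd (or -1) plus a callback for querying fenced execution info. Timing is reported
// directly only when neither a fence nor a callback is returned; otherwise it is obtained later
// through the callback.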
std::tuple<int, int, ExecuteFencedInfoCallback> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    OptionalDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence);
    }

    int n;
    int syncFenceFd;
    ExecuteFencedInfoCallback executeFencedInfoCallback;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, -1, nullptr};
        }
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) =
                execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = mPreparedModel->executeFenced(
                mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
                loopTimeoutDuration, optionalTimeoutDurationAfterFence);
    }
    if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFenceFd, executeFencedInfoCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
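// Driver-managed memories (those backed by an IBuffer) are not directly accessible to the CPU
// path, so they are staged through BLOB-mode AHardwareBuffers: inputs are copied in before
// execution and outputs are copied back out afterwards.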
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    const ModelFactory makeModel = [this] { return mModel->makeModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel(makeModel, preference,
                                                                          priority, {}, {}, {});
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, {}};
    }

    // Prepare device memories for CPU fallback.
    std::vector<const RuntimeMemory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<RuntimeMemory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
                              "has unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, {}};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, {}};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, {}};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    auto [nExecute, outputShapes, timing] = preparedModel->execute(
            mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration);
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    if (nExecute != ANEURALNETWORKS_NO_ERROR) {
        return {nExecute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, {}};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android