/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "TypeManager.h"

namespace android {
namespace nn {

// Partial validation of output shapes returned from driver, to ensure they
// conform to a very specific set of rules.
static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    // Enforces the following rules (some of which are from b/154054474):
    // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
    //   If the vector is not empty, it must have as many entries as the step model has outputs.
    // - If NONE, then either shapes vector is empty, or every shape is
    //   marked isSufficient and, if a tensor, has known rank.
    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty.  At least one entry
    //   is marked !isSufficient.
    switch (executionStatus) {
        case ErrorStatus::NONE: {
            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty or of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape is unexpectedly marked !isSufficient";

            const TypeManager* tm = TypeManager::get();
            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
                 ++outputIndex) {
                const Operand& outputOperand = model->getOutputOperand(outputIndex);
                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
                             (shapes[outputIndex].dimensions.size() != 0))
                        << "With execution ErrorStatus " << executionStatus << " output#"
                        << outputIndex << " shape unexpectedly has zero rank";
            }

            break;
        }
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
            NN_RET_CHECK(shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape must have been marked !isSufficient";
            break;
        }
        default: {
            NN_RET_CHECK(shapes.size() == 0)
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty but has length " << shapes.size();
            break;
        }
    }
    return true;
}

static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
                                          model, shapes);
}

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

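// Checks an optionally supplied operand type override against the model's operand.
// A non-null newType may only fill in dimensions that the model left unspecified (0);
// overriding a fully specified dimension is rejected. When newType is null, the model
// operand itself must be fully specified unless allowUnspecified is true.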
static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtension(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = "
                       << operand.dimensions.size() << ", new = " << newType->dimensionCount << ")";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isSimple());
}

CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isCompound());
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

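// Sets an execution input from a caller-owned buffer. The builder records a pointer to
// the buffer, so the caller must keep it valid until the computation completes.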
int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const RuntimeMemory* memory, size_t offset,
                                         size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getInputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const RuntimeMemory* memory, size_t offset,
                                          size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getOutputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

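// Returns the requested duration in nanoseconds, or UINT64_MAX when it is unavailable.
// The fenced durations only differ from the launched ones when a fenced execution
// callback is present.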
int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        auto result = mFencedExecutionCallback();
        if (!result.has_value()) {
            LOG(ERROR) << "Fenced execution callback failed: " << result.error().message;
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
        std::tie(timingLaunched, timingFenced) = std::move(result).value();
    }
    const OptionalDuration selectedDuration = [durationCode, &timingLaunched,
                                               &timingFenced]() -> OptionalDuration {
        switch (durationCode) {
            case ANEURALNETWORKS_DURATION_ON_HARDWARE:
                return timingLaunched.timeOnDevice;
            case ANEURALNETWORKS_DURATION_IN_DRIVER:
                return timingLaunched.timeInDriver;
            case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
                return timingFenced.timeOnDevice;
            case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
                return timingFenced.timeInDriver;
            default:
                LOG(FATAL) << "unexpected";
                return std::nullopt;
        }
    }();
    if (selectedDuration.has_value()) {
        constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1;
        using CommonType = std::common_type_t<Duration::rep, uint64_t>;
        const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming);
        *duration = static_cast<uint64_t>(count);
    } else {
        constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max();
        *duration = kNoTiming;
    }

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::enableInputAndOutputPadding(bool enable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (mHasCalledSetInputOutput) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input "
                      "or output is set.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mInputAndOutputPaddingEnabled = enable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setReusable(bool reusable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mReusable = reusable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    std::lock_guard<std::mutex> lock(mStateMutex);
    if (!mReusable && mState == State::COMPLETED) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on a non-reusable execution that has already completed";
        return false;
    }
    if (mState == State::COMPUTATION) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on an execution that has already started";
        return false;
    }
    mState = State::COMPUTATION;
    return true;
}

// TODO(b/132321855): validate that we have full types for all inputs and outputs,
// that the graph is not cyclic,
static int validateRequest(const std::vector<ModelArgumentInfo>& inputs,
                           const std::vector<ModelArgumentInfo>& outputs) {
    for (auto& p : inputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : outputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getValidationResultCode() {
    if (!mValidationResultCode.has_value()) {
        mValidationResultCode = validateRequest(mInputs, mOutputs);
    }
    return mValidationResultCode.value();
}

bool ExecutionBuilder::areOutputsFullySpecified() {
    if (!mOutputsFullySpecified.has_value()) {
        mOutputsFullySpecified = true;
        for (uint32_t i = 0; i < mOutputs.size(); i++) {
            if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
                TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) &&
                tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type,
                                               mOutputs[i].initialDimensions())) {
                mOutputsFullySpecified = false;
                break;
            }
        }
    }
    return mOutputsFullySpecified.value();
}

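// Common entry-point bookkeeping for every compute variant: transition to the
// COMPUTATION state and surface any cached request-validation failure.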
int ExecutionBuilder::prepareForCompute(const char* name) {
    if (!checkAndSetComputationState(name)) {
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) {
        return finishComputation(n, {});
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr,
                          /*reusable=*/false);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, {}, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

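// Single-partition execution: run the plan's one StepExecutor and, on a potentially
// recoverable error, retry the whole model on the CPU if fallback is allowed.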
std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
    auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {n, std::move(outputShapes), timing};
    }

    // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
    if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
        return {n, std::move(outputShapes), {}};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, {}, {}};
    }

    // If CPU execution was already attempted, do not perform CPU fallback.
    if (mExecutor->isCpu()) {
        return {n, {}, {}};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the execution. Do an execution fallback on the CPU.
    return cpuFallbackFull(this);
}

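// Multi-partition execution: walk the plan step by step. An OUTPUT_INSUFFICIENT_SIZE
// failure caused only by dynamic temporaries re-runs the step with updated sizes; other
// failures fall back to partial or full CPU execution when allowed.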
std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";

    auto controller = mPlan->makeController(this, burstBuilder);
    std::vector<OutputShape> outputShapes = getInitialOutputShapes();

    // On this iteration, do I need to repeat the previous step because it
    // reported insufficient size?
    bool doInsufficientSizeFallback = false;

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        SharedBurst burstController;
        int n = doInsufficientSizeFallback
                        ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
                        : mPlan->next(controller, &executor, &burstController, &outputShapes);
        doInsufficientSizeFallback = false;
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            return {n, {}, {}};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);

        // Update global outputs and dynamic temporaries.
        StepExecutor::UpdateOutputShapes updateOutputShapes = {};
        if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
                                          &updateOutputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            if (updateOutputShapes.zeroSizedInput) {
                // We'll need to do full model CPU fallback
                VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
                stepN = ANEURALNETWORKS_OP_FAILED;
            } else {
                CHECK(executor->areDynamicTemporariesAllocated());
                continue;
            }
        }

        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
            if (updateOutputShapes.mainOutputInsufficient ||
                !updateOutputShapes.updatedDynamicTemporary) {
                // Either:
                // - At least one main model output is not of sufficient size; or
                // - we didn't learn anything new about dynamic temporaries.
                // Neither of these is recoverable, so end execution.
                return {stepN, outputShapes, {}};
            }
            // Every main model output is of sufficient size.  This implies that
            // at least one dynamic temporary is not of sufficient size.  This
            // is recoverable.
            doInsufficientSizeFallback = true;
            continue;
        }

        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, {}, {}};
        }

        // If CPU execution was already attempted, perform a full CPU fallback.
        if (executorIsCpu) {
            break;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(mAllowCpuFallback);
        if (updateOutputShapes.zeroSizedInput) {
            // Do not attempt a partial fallback.
            break;
        }
        while (true) {
            auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
                    cpuFallbackPartial(*mPlan, controller);

            // Update global outputs and dynamic temporaries.
            StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
            if (fallbackExecutor != nullptr &&
                !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
                                                      &outputShapes, &fallbackUpdateOutputShapes)) {
                fallbackN = ANEURALNETWORKS_OP_FAILED;
            }

            // If execution was successful, continue to next step.
            if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
                if (fallbackUpdateOutputShapes.zeroSizedInput) {
                    // We'll need to do full model CPU fallback
                    VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
                    fallbackN = ANEURALNETWORKS_OP_FAILED;
                    break;
                }
                CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
                goto nextStep;
            }

            if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
                VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
                                << toString(fallbackUpdateOutputShapes);
                if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
                    !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
                    // Either:
                    // - At least one main model output is not of sufficient size; or
                    // - we didn't learn anything new about dynamic temporaries.
                    // Neither of these is recoverable, so end execution.
                    return {fallbackN, outputShapes, {}};
                }
                // Every main model output is of sufficient size.  This implies
                // that at least one dynamic temporary is not of sufficient
                // size.  This is recoverable.
                continue;
            }

            // If the code reaches this point, then there was an error with the
            // fallback. In this case, attempt full fallback.
            break;
        }

        // If the code reaches this point, then there was an error with the
        // fallback. In this case, attempt full fallback.
        break;

    nextStep:
        // Bottom of the outer loop
        continue;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    return cpuFallbackFull(this);
}

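// Blocks until every valid fence in waitFor has signaled; returns false if any wait
// fails or ends in an error state.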
static bool waitForSyncFences(const std::vector<int>& waitFor) {
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return false;
            }
        }
    }
    return true;
}

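// Fenced execution of a single-partition plan: try the driver first and, on a
// potentially recoverable error, wait on the incoming fences and fall back to an
// unfenced CPU execution of the full model.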
std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto [n, syncFd, callback] =
            mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, -1, nullptr};
    }

    // If CPU execution was already attempted, return from the function with an error.
    if (mExecutor->isCpu()) {
        return {n, -1, nullptr};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
    reportTimingWithoutFencedExecutionCallback(fallbackTiming);
    return {fallbackN, -1, nullptr};
}

// In the case of partitioned execution, the computeFencedInternal call returns the sync
// fence and the fenced compute callback of the last partition. Any failed partition
// causes the whole execution to fall back to CPU if mAllowCpuFallback is set to true.
std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";

    // We should have detected this earlier in the call chain and fallen back to
    // non-fenced execution.  This is an implementation limitation: In order to
    // support dynamic temporaries in this code, we'd need to implement
    // something like the following:
    // - If a partition has outputs of unknown size, compute that partition in a
    //   non-fenced fashion, just as if it were scheduled on a driver that does
    //   not support fenced execution.
    // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
    //   that handles a step execution that fails with
    //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
    CHECK(!mCompilation->hasDynamicTemporaries());

    // Initiate waitForFds, syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    base::unique_fd syncFence;
    ExecuteFencedInfoCallback executeFencedInfoCallback;

    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return {n, -1, nullptr};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback};
        }

        // Attempt to compute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence.reset(syncFd);
        executeFencedInfoCallback = callback;
        waitForFds.clear();
        if (syncFd >= 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, -1, nullptr};
        }

        // If the code reaches this point, then there was an error with the step
        // execution. In this case, attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
    return {fullN, -1, nullptr};
}

int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    NN_RETURN_IF_ERROR(prepareForCompute("startComputeWithDependencies"));
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return finishComputation(ANEURALNETWORKS_BAD_DATA, {});
        }
    }
    if (!areOutputsFullySpecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " not all outputs have fully specified dimensions";
        return finishComputation(ANEURALNETWORKS_BAD_DATA, {});
    }

    // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
    // fenced executions do not support dynamic output shape.

    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    const auto deadline = makeDeadline(mTimeoutDuration);
    std::tie(result, *syncFence, mFencedExecutionCallback) =
            computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
    // If there is an error, call finishComputation to mark the computation as completed.
    // Otherwise, we will call finishComputation in SyncFenceEvent::wait().
    if (result != ANEURALNETWORKS_NO_ERROR) {
        // TODO(miaowang): support dynamic output shape only with memory domain.
        // For now just return empty output shapes.
        result = finishComputation(result, {});
    }
    return result;
}

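// Dispatches a computation. Exactly one mode applies: synchronous
// (synchronizationCallback == nullptr), asynchronous (a callback is returned through
// synchronizationCallback), or burst (burstBuilder != nullptr).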
int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                              BurstBuilder* burstBuilder) {
    CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
            << "synchronizationCallback and burstBuilder cannot simultaneously be used";

    const bool synchronous = (synchronizationCallback == nullptr);
    if (!synchronous) {
        *synchronizationCallback = nullptr;
    }

    const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
    NN_RETURN_IF_ERROR(prepareForCompute(name));

    // Validate input memory dimensions. We need to do the validation in every computation because
    // the memory dimensions may change between computations.
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::MEMORY) {
            const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex];
            if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
                return finishComputation(ANEURALNETWORKS_OP_FAILED, {});
            }
        }
    }

    // Reset output dimensions.
    if (!areOutputsFullySpecified()) {
        for (auto& output : mOutputs) {
            output.reset();
        }
    }

    const auto deadline = makeDeadline(mTimeoutDuration);
    if (synchronous) {
        if (burstBuilder) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
        }
        const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
        if (mMeasureTiming) {
            mTimingWithoutFencedExecutionCallback = timing;
        }
        return finishComputation(n, outputShapes);
    } else /* asynchronous */ {
        // TODO: For asynchronous execution, entire plan-based-path should run in an
        // asynchronous thread -- take the asynchronous thread logic out of
        // CpuExecution::compute() and use it to wrap the plan-based-path.

        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.

        // Prepare the callback for asynchronous execution.
        // std::shared_ptr<ExecutionCallback> object is returned when the
        // execution has been successfully launched, otherwise a
        // nullptr is returned.  The executionCallback is
        // abstracted in the NN API as an "event".
        auto executionCallback = std::make_shared<ExecutionCallback>();
        executionCallback->setOnFinish(
                [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
                    return finishComputation(error, outputShapes);
                });
        const auto asyncStartCompute = [this, deadline, executionCallback] {
            const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
            const auto status = convertResultCodeToErrorStatus(n);
            executionCallback->notify(status, outputShapes, timing);
        };
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartCompute();
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(asyncStartCompute);
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       std::vector<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Checks whether the dimensions "to" can be updated with the dimensions "from", where
// "from" must be at least as fully specified as "to".
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}

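// A zero-sized tensor is a successfully computed output whose shape is fully known and
// contains at least one zero dimension; it carries no data but is a valid result.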
static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
    return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
           outputShape.dimensions.size() &&
           (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
            outputShape.dimensions.end());
}

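// Merges driver-reported output shapes into this execution's outputs, after validating
// that only unspecified dimensions or ranks are overwritten and that no output size
// overflows uint32_t.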
bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
                                          const std::vector<OutputShape>& outputShapes) {
    NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));

    if (outputShapes.size() == 0) {
        return true;
    }
    NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        // Check if only unspecified dimensions or rank are overwritten.
        NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
        const OperandType operandType = mModel->getOutputOperand(i).type;
        NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
                                                                    outputShapes[i].dimensions));
    }
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        mOutputs[i].dimensions() = outputShapes[i].dimensions;
        mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
    }
    return true;
}

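// Propagates the (possibly updated) output dimensions of this execution to the
// validators of the device memories used as outputs, so that later consumers of
// those memories observe the updated metadata.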
bool ExecutionBuilder::updateMemories() {
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
    }
    return true;
}

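// Records the outcome of a computation: updates the output shapes and memory
// metadata, marks output device memories as initialized (or not) according to
// success, records the completion reason, and moves the execution to the
// COMPLETED state.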
int ExecutionBuilder::finishComputation(int result, const std::vector<OutputShape>& outputShapes) {
    const auto status = convertResultCodeToErrorStatus(result);
    if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
        result = ANEURALNETWORKS_OP_FAILED;
    }
    bool success = result == ANEURALNETWORKS_NO_ERROR;
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        memory->getValidator().setInitialized(success);
    }
    switch (result) {
        case ANEURALNETWORKS_NO_ERROR:
            mCompletion = Completion::NO_ERROR;
            break;
        case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
            mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE;
            break;
        default:
            mCompletion = Completion::OTHER_ERROR;
            break;
    }
    {
        std::lock_guard<std::mutex> lock(mStateMutex);
        CHECK(mState != State::PREPARATION)
                << "ExecutionBuilder::finishComputation is called in the preparation state";
        CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice";
        mState = State::COMPLETED;
    }
    return result;
}

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
    return "{ .updatedDynamicTemporary = " +
           std::to_string(updateOutputShapes.updatedDynamicTemporary) +
           ", .mainOutputInsufficient = " +
           std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
}

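// Maps the output shapes reported by the driver for a step model back onto the
// main model's outputs ("to") and redeclares any dynamic temporaries whose
// shapes or sizes changed.  "update" summarizes what changed for the caller.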
bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                                      std::vector<OutputShape>* to, UpdateOutputShapes* update) {
    CHECK(update != nullptr);
    *update = {.updatedDynamicTemporary = false,
               .mainOutputInsufficient = false,
               .zeroSizedInput = false};

    NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));

    if (from.size() == 0) {
        return true;
    }

    if (VLOG_IS_ON(EXECUTION)) {
        for (const auto& shape : from) {
            VLOG(EXECUTION) << "updateOutputShapes: " << shape;
        }
    }

    if (mExecutionStep != nullptr) {
        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
        NN_RET_CHECK_LE(indexMapping.size(), from.size());
        for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
            const uint32_t toIndex = indexMapping[i];
            NN_RET_CHECK_GT(to->size(), toIndex);
            NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
            (*to)[toIndex] = from[i];
            update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
            if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
                isZeroSizedTensor(executionResultCode, from[i])) {
                update->zeroSizedInput = true;
            }
        }

        if (!mDynamicTemporaries->empty()) {
            // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
            std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
            for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
                operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
            }

            const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
            for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
                const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
                const auto it =
                        operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
                if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
                    continue;
                }
                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
                VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
                                << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
                                << sourceOperandIndex.second << ") is a dynamic temporary";
                // This is a temporary, but it might not be a dynamic temporary.
                const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
                if (loc == std::nullopt) {
                    continue;
                }
                NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
                bool changedShape = false;
                const uint32_t actualSize = TypeManager::get()->getSizeOfData(
                        mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
                if (actualSize > 0) {
                    changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
                                                                  from[i].dimensions, actualSize);
                } else if (!from[i].isSufficient) {
                    NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2)
                            << "output#" << i << " paddedLength overflow";
                    changedShape = mDynamicTemporaries->redeclare(
                            sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength);
                } else {
                    // The combination of not-fully-specified dimensions
                    // and isSufficient means that we have no
                    // information about whether the size of the dynamic
                    // temporary is adequate.
                    VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
                    if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
                        NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
                        // This is a zero-sized tensor, and by
                        // definition, any dynamic temporary is an input
                        // to an execution step.
                        update->zeroSizedInput = true;
                    }
                }
                if (changedShape) {
                    // TODO: find a better place for this comment.
                    //
                    // isUpdatable(a, b) imposes a partial ordering a <=
                    // b.  Every fully specified dimensions vector is an
                    // upper bound of that ordering.  Therefore, any
                    // change in dimensions moves towards an upper
                    // bound, and hence there are a finite number of
                    // such changes possible.
                    //
                    // actualSize can only be computed from dimensions
                    // that are an upper bound.  Therefore, once
                    // actualSize is computed, it will not change.
                    //
                    // If dimensions are not fully specified, and
                    // estimated size changes, it increases.  There is
                    // an upper bound on estimated size to avoid
                    // overflow.
                    //
                    // Therefore, if we retry only when dimensions or
                    // size change, and we stop retrying if we would
                    // otherwise overflow, we should only retry a finite
                    // number of times.
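                    //
                    // Illustrative example (hypothetical shapes): a dynamic
                    // temporary declared as {0, 0} might be redeclared as
                    // {4, 0} and later as {4, 8}; each redeclaration is an
                    // isUpdatable() move toward the fully specified upper
                    // bound {4, 8}, so only finitely many are possible.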
                    update->updatedDynamicTemporary = true;
                }
            }
            mDynamicTemporaries->vlogDump("finished updateOutputShapes");
        }
    } else {
        NN_RET_CHECK_EQ(from.size(), to->size());
        for (uint32_t i = 0, e = from.size(); i < e; i++) {
            NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
            (*to)[i] = from[i];
        }
    }
    return true;
}

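// "step" and "dynamicTemporaries" must either both be null (executor for the
// whole model) or both be non-null (executor for a single partition step); a
// reusable StepExecutor must not have dynamic temporaries.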
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                           std::shared_ptr<Device> device,
                           std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                           const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries)
    : mExecutionBuilder(executionBuilder),
      mExecutionStep(step),
      mDynamicTemporaries(dynamicTemporaries),
      mModel(model),
      mDevice(device),
      mPreparedModel(preparedModel),
      mInputs(model->inputCount()),
      mOutputs(model->outputCount()),
      mReusable(reusable) {
    CHECK(mDevice != nullptr);
    CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
    CHECK(!(reusable && dynamicTemporaries != nullptr));
    VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                    << mOutputs.size() << " outputs";
}

bool StepExecutor::areDynamicTemporariesAllocated() const {
    return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
}

void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

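// Copies an input/output binding from the ExecutionBuilder into this
// StepExecutor, optionally overriding its dimensions.  For MEMORY arguments,
// the builder's pool index is remapped into this executor's memory tracker.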
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput,
                                    const Dimensions* builderDimensions) {
    auto updateDimensions = [executorInputOrOutput, builderDimensions] {
        if (!builderDimensions) {
            return;
        }
        executorInputOrOutput->dimensions() = *builderDimensions;
    };

    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::POINTER:
            updateDimensions();
            break;
        case ModelArgumentInfo::MEMORY: {
            updateDimensions();
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}

int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const RuntimeMemory* memory, uint32_t offset,
                                             uint32_t length, const Dimensions& dimensions,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
        CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
        inputOrOutputInfo->dimensions() = dimensions;
    }
    return n;
}

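// Formats dimensions as a parenthesized, comma-separated list;
// e.g., {2, 5, 3} yields "(2, 5, 3)" and {} yields "()".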
static std::string toString(std::vector<uint32_t> dimensions) {
    std::string ret = "(";
    bool wroteOne = false;
    for (uint32_t dimension : dimensions) {
        if (wroteOne) {
            ret += ", ";
        } else {
            wroteOne = true;
        }
        ret += std::to_string(dimension);
    }
    ret += ")";
    return ret;
}

static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

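// Lazily creates the reusable execution object on first use and caches it, so
// that repeated computations on a reusable StepExecutor share a single
// RuntimeExecution.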
std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() {
    CHECK(mReusable);
    if (mExecution == nullptr) {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        auto [n, execution] = mPreparedModel->createReusableExecution(
                mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return {n, nullptr};
        }
        mExecution = std::move(execution);
    }
    return {ANEURALNETWORKS_NO_ERROR, mExecution};
}

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const OptionalTimePoint& deadline, const SharedBurst& burstController) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    int n;
    std::vector<OutputShape> outputShapes;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, {}, {}};
        }
        std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, outputShapes, timing) =
                mPreparedModel->execute(mInputs, mOutputs, mMemories.getObjects(), burstController,
                                        measure, deadline, loopTimeoutDuration);
    }
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    return {n, std::move(outputShapes), std::move(timing)};
}

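// Launches a fenced computation.  Returns a status code, a sync fence fd (-1 if
// no fence is available), and a callback for querying fenced execution info.
// Timing is reported immediately only when neither a fence nor a callback is
// returned.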
std::tuple<int, int, ExecuteFencedInfoCallback> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    OptionalDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence);
    }

    int n;
    int syncFenceFd;
    ExecuteFencedInfoCallback executeFencedInfoCallback;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, -1, nullptr};
        }
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) =
                execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = mPreparedModel->executeFenced(
                mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
                loopTimeoutDuration, optionalTimeoutDurationAfterFence);
    }
    if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFenceFd, executeFencedInfoCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start the computation.
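// Device memories backed by an IBuffer are not directly CPU-accessible, so
// their contents are staged through BLOB mode AHardwareBuffers: inputs are
// copied in before execution and outputs are copied back afterwards.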
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    const ModelFactory makeModel = [this] { return mModel->makeModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel(makeModel, preference,
                                                                          priority, {}, {}, {});
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, {}};
    }

    // Prepare device memories for CPU fallback.
    std::vector<const RuntimeMemory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<RuntimeMemory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
                              "has unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, {}};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, {}};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, {}};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    auto [nExecute, outputShapes, timing] = preparedModel->execute(
            mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration);
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    if (nExecute != ANEURALNETWORKS_NO_ERROR) {
        return {nExecute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, {}};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android