/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <Tracing.h>
#include <android-base/unique_fd.h>
#include <nnapi/IBurst.h>
#include <nnapi/IDevice.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

#include "ExecutionCallback.h"
#include "FeatureLevel.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "TypeManager.h"

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
#include <build/version.h>
#include <cutils/native_handle.h>
#include <nnapi/hal/1.3/Buffer.h>
#include <nnapi/hal/Service.h>

#include "AppInfoFetcher.h"
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

namespace android {
namespace nn {

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(SharedDevice device, bool isUpdatable = false);

    // Prefer using DriverDevice::create
    explicit DriverDevice(SharedDevice device, bool isUpdatable);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override;
    int32_t getType() const override { return static_cast<int32_t>(kInterface->getType()); }
    bool isUpdatable() const override { return kIsUpdatable; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kInterface->getCapabilities(); }
    Capabilities::PerformanceInfo getPerformance(OperandType type) const override {
        return getCapabilities().operandPerformance.lookup(type);
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceScalar;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceTensor;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override {
        return getCapabilities().ifPerformance;
    }
    Capabilities::PerformanceInfo getWhilePerformance() const override {
        return getCapabilities().whilePerformance;
    }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return kInterface->getNumberOfCacheFilesNeeded();
    }
    bool isCachingSupported() const override {
        // Caching is supported if either numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] = getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override {
        auto result = kInterface->wait();
        if (!result.ok()) {
            LOG(ERROR) << "DriverDevice::wait error: " << result.error().message;
            return convertErrorStatusToResultCode(result.error().code);
        }
        return ANEURALNETWORKS_NO_ERROR;
    }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType) const override;

   private:
    const SharedDevice kInterface;
    const bool kIsUpdatable;

    GeneralResult<std::vector<bool>> getSupportedOperationsImpl(const MetaModel& metaModel) const;
    GeneralResult<SharedPreparedModel> prepareModelFromCacheInternal(
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const CacheToken& token) const;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A RuntimePreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public RuntimePreparedModel {
   public:
    DriverPreparedModel(const Device* device, const SharedPreparedModel& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    SharedPreparedModel getInterface() const override { return mPreparedModel; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override {
        return mPreparedModel->configureExecutionBurst();
    }

    MemoryPreference getMemoryPreference() const override {
        if (mDevice->getFeatureLevel() >= ANEURALNETWORKS_FEATURE_LEVEL_5) {
            return {kDefaultRequestMemoryAlignment, kDefaultRequestMemoryPadding};
        } else {
            // We are not able to pass memory padding information to HIDL drivers, so return the
            // minimum padding.
            return {kDefaultRequestMemoryAlignment, kMinMemoryPadding};
        }
    }

   private:
    const Device* mDevice;
    const SharedPreparedModel mPreparedModel;
};

class DriverExecution : public RuntimeExecution {
   public:
    DriverExecution(SharedExecution execution, Request request,
                    std::vector<const RuntimeMemory*> memories, MeasureTiming measure,
                    OptionalDuration loopTimeoutDuration, int64_t deviceFeatureLevel)
        : kExecution(std::move(execution)),
          kRequest(std::move(request)),
          kMemories(std::move(memories)),
          kMeasure(measure),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)),
          kDeviceFeatureLevel(deviceFeatureLevel) {
        CHECK(kExecution != nullptr);
    }

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const SharedExecution kExecution;

    // For burst execution.
    const Request kRequest;
    const std::vector<const RuntimeMemory*> kMemories;
    const MeasureTiming kMeasure;
    const OptionalDuration kLoopTimeoutDuration;
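    // Reusable burst executions are created lazily, once per burst controller, and cached here
    // keyed by the controller so that repeated compute() calls through the same burst reuse them.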
    mutable std::map<const IBurst*, SharedExecution> mCachedBurstExecutions;

    // For fenced execution.
    const int64_t kDeviceFeatureLevel;
};

DriverDevice::DriverDevice(SharedDevice device, bool isUpdatable)
    : kInterface(std::move(device)), kIsUpdatable(isUpdatable) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(SharedDevice device, bool isUpdatable) {
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create called with nullptr";
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device), isUpdatable);
}

int64_t DriverDevice::getFeatureLevel() const {
    Version featureLevel = kInterface->getFeatureLevel();
    switch (featureLevel) {
        case Version::ANDROID_OC_MR1:
            return ANEURALNETWORKS_FEATURE_LEVEL_1;
        case Version::ANDROID_P:
            return ANEURALNETWORKS_FEATURE_LEVEL_2;
        case Version::ANDROID_Q:
            return ANEURALNETWORKS_FEATURE_LEVEL_3;
        case Version::ANDROID_R:
            return ANEURALNETWORKS_FEATURE_LEVEL_4;
        case Version::ANDROID_S:
            return ANEURALNETWORKS_FEATURE_LEVEL_5;
        case Version::CURRENT_RUNTIME:
            break;
    }
    LOG(FATAL) << "Unsupported driver feature level: " << featureLevel;
    return -1;
}

GeneralResult<std::vector<bool>> DriverDevice::getSupportedOperationsImpl(
        const MetaModel& metaModel) const {
    const auto featureLevel = kInterface->getFeatureLevel();
    const auto slice = metaModel.getSlice(featureLevel);
    if (!slice.has_value()) {
        return NN_ERROR() << "getSlice(" << featureLevel << ") failed";
    }

    const auto& [sliceModel, slicedModelOperationIndexToModelOperationIndex] = *slice;
    const std::vector<bool> supported = NN_TRY(kInterface->getSupportedOperations(sliceModel));
    const uint32_t slicedOperationCount = sliceModel.main.operations.size();
    if (supported.size() != slicedOperationCount) {
        return NN_ERROR() << "IDevice::getSupportedOperations returned a vector of length "
                          << supported.size() << " when expecting " << slicedOperationCount;
    }

    const Model& model = metaModel.getModel();
    const uint32_t operationCount = model.main.operations.size();
    std::vector<bool> remappedSupported(operationCount, false);
    for (size_t i = 0; i < supported.size(); ++i) {
        if (supported[i]) {
            remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
        }
    }
    return remappedSupported;
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();

    auto result = getSupportedOperationsImpl(metaModel);
    if (!result.ok()) {
        LOG(ERROR) << "getSupportedOperations failed with code " << result.error().code << ": "
                   << result.error().message;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(model.main.operations.size(), false);
    }

    std::vector<bool>& supportedOperations = result.value();
#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

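    // debug.nn.sample.supported == 1: deterministically mark a pseudo-random subset of the
    // operations reported as supported as unsupported instead, based on a hash of the device name,
    // the operation type, and the operand metadata. This exercises partitioning across devices
    // while keeping the selection stable from run to run.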
    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = model.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&model, &accumulator](const std::vector<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = model.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (const Dimension& dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY ||
                        operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE ||
                        operand.lifetime == Operand::LifeTime::POINTER) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

// Opens a cache file for reading and writing and returns a shared handle.
static GeneralResult<SharedHandle> createCacheHandle(const std::string& filename,
                                                     bool createIfNotExist) {
    auto fd = base::unique_fd(open(filename.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
                                   S_IRUSR | S_IWUSR));
    if (fd.get() == -1) {
        return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
               << "Failed to " << (createIfNotExist ? "open or create" : "open") << " cache file "
               << filename;
    }
    std::vector<base::unique_fd> fds;
    fds.push_back(std::move(fd));
    return std::make_shared<const Handle>(Handle{
            .fds = std::move(fds),
            .ints = {},
    });
}

// Opens a list of cache files and returns a vector of shared handles. The files
// are always opened with both read and write permissions.
static GeneralResult<std::vector<SharedHandle>> createCacheHandleVec(
        uint32_t numCacheFiles, const std::string& baseFilename, bool createIfNotExist) {
    CHECK(numCacheFiles <= kMaxNumberOfCacheFiles);
    std::vector<SharedHandle> handles;
    handles.reserve(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFilename + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        handles.push_back(NN_TRY(createCacheHandle(filename, createIfNotExist)));
    }
    return handles;
}

// Maps a token to cache file names and returns a pair of vectors of shared
// handles to the opened files.
static GeneralResult<CacheHandles> getCacheHandles(
        const CacheInfo& cacheInfo, const CacheToken& token,
        const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist) {
    if (const auto* cacheHandles = std::get_if<CacheHandles>(&cacheInfo.variant)) {
        if (cacheHandles->modelCache.size() != numCacheFiles.first) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.first << " model cache handles, got "
                   << cacheHandles->modelCache.size();
        }
        if (cacheHandles->dataCache.size() != numCacheFiles.second) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.second << " data cache handles, got "
                   << cacheHandles->dataCache.size();
        }
        return *cacheHandles;
    }

    // The filename includes kByteSizeOfCacheToken * 2 characters for token,
    // and 1 character for model/data cache identifier.
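    // Each token byte is encoded as two characters in the range 'A'..'P' (low nibble first, then
    // high nibble), and the trailing character is later set to '1' for the model cache or '2' for
    // the data cache.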
    std::string filename(kByteSizeOfCacheToken * 2 + 1, '0');
    for (uint32_t i = 0; i < kByteSizeOfCacheToken; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }

    const auto& cacheDir = std::get<CacheDir>(cacheInfo.variant);
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;
    const uint32_t cacheTypeIdentifierIndex = cacheDir.size() + kByteSizeOfCacheToken * 2;

    cacheFileName[cacheTypeIdentifierIndex] = '1';
    std::vector<SharedHandle> modelCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist));

    cacheFileName[cacheTypeIdentifierIndex] = '2';
    std::vector<SharedHandle> dataCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist));

    return CacheHandles{
            .modelCache = std::move(modelCache),
            .dataCache = std::move(dataCache),
    };
}

GeneralResult<SharedPreparedModel> DriverDevice::prepareModelFromCacheInternal(
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const CacheToken& token) const {
    // Get cache files if they exist, otherwise return from the function early.
    auto cache = NN_TRY(getCacheHandles(cacheInfo, token, kInterface->getNumberOfCacheFilesNeeded(),
                                        /*createIfNotExist=*/false));
    return kInterface->prepareModelFromCache(deadline, cache.modelCache, cache.dataCache, token);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const std::optional<CacheToken>& maybeToken) const {
    // Attempt to compile from cache if token is present.
    if (maybeToken.has_value()) {
        auto result = prepareModelFromCacheInternal(deadline, cacheInfo, *maybeToken);
        if (result.has_value()) {
            return {ANEURALNETWORKS_NO_ERROR,
                    std::make_shared<DriverPreparedModel>(this, std::move(result).value())};
        } else {
            LOG(ERROR) << "prepareModelFromCache failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get cache files if they exist, otherwise create them.
    CacheHandles cache;
    if (maybeToken.has_value()) {
        auto result =
                getCacheHandles(cacheInfo, *maybeToken, kInterface->getNumberOfCacheFilesNeeded(),
                                /*createIfNotExist=*/true);
        if (result.has_value()) {
            cache = std::move(result).value();
        } else {
            LOG(ERROR) << "getCacheHandles failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get the token if it exists, otherwise get a null token.
    static constexpr CacheToken kNullToken = {};
    const CacheToken token = maybeToken.value_or(kNullToken);

    // Fallback to full compilation (possibly with token) if
    // prepareModelFromCache could not be used or failed.
    const Model model = makeModel();
    auto result = kInterface->prepareModel(model, preference, priority, deadline, cache.modelCache,
                                           cache.dataCache, token);
    if (!result.ok()) {
        LOG(ERROR) << "IDevice::prepareModel() error: " << result.error().message;
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    SharedPreparedModel preparedModel = std::move(result).value();
    CHECK(preparedModel != nullptr)
            << "IDevice::prepareModel() returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR,
            std::make_shared<DriverPreparedModel>(this, std::move(preparedModel))};
}

std::pair<int, std::unique_ptr<RuntimeMemory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                                      OperandType) const {
    const BufferDesc bufferDesc = {.dimensions = desc.dimensions};
    std::vector<SharedPreparedModel> preparedModels(desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto result =
            kInterface->allocate(bufferDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (!result.ok()) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    return MemoryFromDevice::create(std::move(result).value());
}

static Request createDriverRequest(const std::vector<ModelArgumentInfo>& inputs,
                                   const std::vector<ModelArgumentInfo>& outputs,
                                   const std::vector<const RuntimeMemory*>& memories) {
    Request request;
    request.inputs.reserve(inputs.size());
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(request.inputs),
                   [](const auto& input) { return input.createRequestArgument(); });
    request.outputs.reserve(outputs.size());
    std::transform(outputs.begin(), outputs.end(), std::back_inserter(request.outputs),
                   [](const auto& output) { return output.createRequestArgument(); });
    request.pools.reserve(memories.size());
    std::transform(memories.begin(), memories.end(), std::back_inserter(request.pools),
                   [](const RuntimeMemory* memory) { return memory->getMemoryPool(); });
    return request;
}

// Perform computation on an actual device driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::execute::execute");

    ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> result;

    // compute using burst if present, otherwise compute from IPreparedModel
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
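        // Let the burst controller cache each memory pool up front; the returned hold is kept by
        // the RuntimeMemory so the cached binding remains valid across later executions.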
        for (const RuntimeMemory* memory : memories) {
            const auto pool = memory->getMemoryPool();
            if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                auto cacheHold = burstController->cacheMemory(*maybeMemory);
                memory->hold(cacheHold);
            }
        }

        VLOG(EXECUTION) << "Before burstController->execute() " << SHOW_IF_DEBUG(request);

        result = burstController->execute(request, measure, deadline, loopTimeoutDuration);
    } else {
        result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
    }

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing;

    if (result.ok()) {
        n = ANEURALNETWORKS_NO_ERROR;
        std::tie(outputShapes, timing) = std::move(result).value();
    } else {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            outputShapes = std::move(returnedOutputShapes);
        }
        return {n, std::move(outputShapes), timing};
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::executeFenced");

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
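    // Fenced execution is only available from NN HAL 1.3 (feature level 4) onward; older drivers
    // fall back to a synchronous execution below.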
    if (mDevice->getFeatureLevel() >= kHalVersionV1_3ToApi.featureLevel) {
        auto result = mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                                    loopTimeoutDuration, timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::executeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**executeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fallback to synchronous execution if executeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::execute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

std::pair<int, std::shared_ptr<RuntimeExecution>> DriverPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::createReusableExecution");

    auto request = createDriverRequest(inputs, outputs, memories);
    auto result = mPreparedModel->createReusableExecution(request, measure, loopTimeoutDuration);
    if (!result.ok()) {
        LOG(ERROR) << "IPreparedModel::createReusableExecution() error: "
                   << result.error().message;
        const int n = convertErrorStatusToResultCode(result.error().code);
        return {n, nullptr};
    }
    auto execution = std::make_shared<DriverExecution>(
            std::move(result).value(), std::move(request), memories, measure, loopTimeoutDuration,
            mDevice->getFeatureLevel());
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> DriverExecution::compute(
        const SharedBurst& burstController, const OptionalTimePoint& deadline) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::compute");

    // compute using burst if present, otherwise compute from IPreparedModel
    SharedExecution execution;
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        // create a reusable burst execution if the controller is not seen before
        auto burstExecution = mCachedBurstExecutions.find(burstController.get());
        if (burstExecution == mCachedBurstExecutions.end()) {
            for (const RuntimeMemory* memory : kMemories) {
                const auto pool = memory->getMemoryPool();
                if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                    auto cacheHold = burstController->cacheMemory(*maybeMemory);
                    memory->hold(cacheHold);
                }
            }
            auto createResult = burstController->createReusableExecution(kRequest, kMeasure,
                                                                         kLoopTimeoutDuration);
            if (!createResult.ok()) {
                LOG(ERROR) << "IBurst::createReusableExecution() error: "
                           << createResult.error().message;
                const int n = convertErrorStatusToResultCode(createResult.error().code);
                return {n, {}, {}};
            }
            execution = std::move(createResult).value();
            mCachedBurstExecutions.emplace(burstController.get(), execution);
        } else {
            execution = burstExecution->second;
        }
        VLOG(EXECUTION) << "Before mBurstExecution->compute() " << SHOW_IF_DEBUG(kRequest);
    } else {
        execution = kExecution;
    }

    CHECK(execution != nullptr);
    auto result = execution->compute(deadline);
    if (!result.ok()) {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        int n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            return {n, std::move(returnedOutputShapes), {}};
        }
        return {n, {}, {}};
    }

    VLOG(EXECUTION) << "DriverExecution::compute completed";
    auto [outputShapes, timing] = std::move(result).value();
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::computeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (kDeviceFeatureLevel >= kHalVersionV1_3ToApi.featureLevel) {
        auto result =
                kExecution->computeFenced(waitForHandles, deadline, timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::computeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**computeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fallback to synchronous execution if computeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = kExecution->compute(deadline);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::compute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverExecution::computeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

static Capabilities createCpuCapabilities() {
    constexpr Capabilities::PerformanceInfo kPerf = {.execTime = 1.0f, .powerUsage = 1.0f};
    constexpr OperandType operandTypes[] = {
            OperandType::FLOAT32,
            OperandType::INT32,
            OperandType::UINT32,
            OperandType::TENSOR_FLOAT32,
            OperandType::TENSOR_INT32,
            OperandType::TENSOR_QUANT8_ASYMM,
            OperandType::BOOL,
            OperandType::TENSOR_QUANT16_SYMM,
            OperandType::TENSOR_FLOAT16,
            OperandType::TENSOR_BOOL8,
            OperandType::FLOAT16,
            OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL,
            OperandType::TENSOR_QUANT16_ASYMM,
            OperandType::TENSOR_QUANT8_SYMM,
            OperandType::TENSOR_QUANT8_ASYMM_SIGNED,
    };

    std::vector<Capabilities::OperandPerformance> operandPerformance;
    operandPerformance.reserve(std::size(operandTypes));
    std::transform(std::begin(operandTypes), std::end(operandTypes),
                   std::back_inserter(operandPerformance), [kPerf](OperandType type) {
                       return Capabilities::OperandPerformance{.type = type, .info = kPerf};
                   });

    auto table =
            Capabilities::OperandPerformanceTable::create(std::move(operandPerformance)).value();

    return Capabilities{
            .relaxedFloat32toFloat16PerformanceScalar = kPerf,
            .relaxedFloat32toFloat16PerformanceTensor = kPerf,
            .operandPerformance = std::move(table),
            .ifPerformance = kPerf,
            .whilePerformance = kPerf,
    };
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    bool isUpdatable() const override { return false; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kCapabilities; }
    Capabilities::PerformanceInfo getPerformance(OperandType) const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override { return kPerformance; }
    Capabilities::PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return {/*numModelCache=*/0, /*numDataCache=*/0};
    }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = kCurrentNNAPIRuntimeFeatureLevel;
    const std::string kName = "nnapi-reference";
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    const std::string kVersionString = build::GetBuildNumber();
#else
    const std::string kVersionString = "UNKNOWN";
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const Capabilities::PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const Capabilities kCapabilities = createCpuCapabilities();
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted RuntimePreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public RuntimePreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<RuntimePreparedModel>> create(Model model);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    SharedPreparedModel getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override { return nullptr; }

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    MemoryPreference getMemoryPreference() const override {
        return {kPreferredAlignment, kPreferredPadding};
    }

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

    const Model& getModel() const { return mModel; }
    const std::vector<RunTimePoolInfo>& getModelPoolInfos() const { return mModelPoolInfos; }

   private:
    // TFLite kernels prefer 64 bytes for padding and alignment.
    static constexpr uint32_t kPreferredAlignment = 64;
    static constexpr uint32_t kPreferredPadding = 64;

    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

class CpuExecution : public RuntimeExecution {
   public:
    CpuExecution(const CpuPreparedModel& preparedModel, Request request,
                 std::vector<RunTimePoolInfo> requestPoolInfos,
                 OptionalDuration loopTimeoutDuration)
        : kPreparedModel(preparedModel),
          kRequest(std::move(request)),
          kRequestPoolInfos(std::move(requestPoolInfos)),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)) {}

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const CpuPreparedModel& kPreparedModel;
    Request kRequest;
    std::vector<RunTimePoolInfo> kRequestPoolInfos;
    const OptionalDuration kLoopTimeoutDuration;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();
    const size_t count = model.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        // We may want to use the slicer for CpuDevice just as we do for
        // DriverDevice.
        OperationType operationType = model.main.operations[i].type;
        result[i] = !isExtension(operationType) && operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& /*cacheInfo*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (auto result = validate(model); !result.ok()) {
        LOG(ERROR) << "Invalid Model: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(preference); !result.ok()) {
        LOG(ERROR) << "Invalid ExecutionPreference: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(priority); !result.ok()) {
        LOG(ERROR) << "Invalid Priority: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<RuntimeMemory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                                   OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuPreparedModel::create(Model model) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromCanonicalMemories(&poolInfos, model.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<RuntimePreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(model), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.has_value()) {
        executor.setLoopTimeout(loopTimeoutDuration->count());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, {}};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration, const OptionalDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

static std::tuple<int, Request, std::vector<RunTimePoolInfo>> createCpuRequest(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories) {
    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const RuntimeMemory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, {}};
        }
    }
    // Create as many pools as there are input / output.
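    // Arguments supplied as raw pointers each get their own pool, created directly over the
    // caller's buffer via RunTimePoolInfo::createFromExistingBuffer, so no data copy is needed.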
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length(),
                                 .padding = argumentInfo.padding()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    return {ANEURALNETWORKS_NO_ERROR, std::move(request), std::move(requestPoolInfos)};
}

// Perform computation on NNAPI CPU reference implementation.
//
// Contrary to DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
// there are input/output in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& /*burstController*/,
        MeasureTiming /*measure*/, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    int nCreateRequest;
    Request request;
    std::vector<RunTimePoolInfo> requestPoolInfos;
    std::tie(nCreateRequest, request, requestPoolInfos) =
            createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        // of spinning up a new thread.
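        // Note that even this "async" path runs the work on a newly spawned thread and joins it
        // immediately, so the call still blocks; it mainly exercises the threaded code path.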
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

std::pair<int, std::shared_ptr<RuntimeExecution>> CpuPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming /*measure*/,
        const OptionalDuration& loopTimeoutDuration) const {
    auto [nCreateRequest, request, requestPoolInfos] = createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, nullptr};
    }
    auto execution = std::make_shared<CpuExecution>(
            *this, std::move(request), std::move(requestPoolInfos), loopTimeoutDuration);
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> CpuExecution::compute(
        const SharedBurst& /*burstController*/, const OptionalTimePoint& deadline) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        // of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &deadline, &result] {
            result = computeOnCpu(kPreparedModel.getModel(), kRequest,
                                  kPreparedModel.getModelPoolInfos(), kRequestPoolInfos, deadline,
                                  kLoopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(kPreparedModel.getModel(), kRequest, kPreparedModel.getModelPoolInfos(),
                        kRequestPoolInfos, deadline, kLoopTimeoutDuration);
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuExecution::computeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = compute(nullptr, closestDeadline);
    return {result, -1, nullptr, timing};
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const SharedDevice& device) {
    VLOG(MANAGER) << "forTest_makeDriverDevice(" << device->getName() << ")";
    const auto driverDevice = DriverDevice::create(device);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
    const auto& appInfo = AppInfoFetcher::get()->getAppInfo();
    const bool currentProcessIsOnThePlatform =
            appInfo.appIsSystemApp || appInfo.appIsOnVendorImage || appInfo.appIsOnProductImage;

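    // Updatable drivers are only exposed to ordinary application processes; processes running
    // from the system, vendor, or product image query only the non-updatable drivers.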
    const bool includeUpdatableDrivers = !currentProcessIsOnThePlatform;
    auto devicesAndUpdatability =
            hardware::neuralnetworks::service::getDevices(includeUpdatableDrivers);

    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devicesAndUpdatability.size());
    for (auto& [device, isDeviceUpdatable] : devicesAndUpdatability) {
        driverDevices.push_back(DriverDevice::create(std::move(device), isDeviceUpdatable));
    }
    return driverDevices;
}
#else
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
    auto devices = getDevices();
    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devices.size());
    for (auto& device : devices) {
        driverDevices.push_back(DriverDevice::create(std::move(device)));
    }
    return driverDevices;
}
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    auto driverDevices = getDriverDevices();
    for (auto& driverDevice : driverDevices) {
        VLOG(MANAGER) << "Found interface " << driverDevice->getName();
        mDevices.push_back(std::move(driverDevice));
    }

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
}

void DeviceManager::registerDevice(const SharedDevice& device) {
    if (auto driverDevice = DriverDevice::create(device)) {
        mDevices.push_back(std::move(driverDevice));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android