/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <Tracing.h>
#include <nnapi/IBurst.h>
#include <nnapi/IDevice.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "FeatureLevel.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "TypeManager.h"

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
#include <build/version.h>
#include <cutils/native_handle.h>
#include <nnapi/hal/1.3/Buffer.h>
#include <nnapi/hal/Service.h>

#include "AppInfoFetcher.h"
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

namespace android {
namespace nn {

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a SharedDevice.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(SharedDevice device, bool isUpdatable = false);

    // Prefer using DriverDevice::create
    explicit DriverDevice(SharedDevice device, bool isUpdatable);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override;
    int32_t getType() const override { return static_cast<int32_t>(kInterface->getType()); }
    bool isUpdatable() const override { return kIsUpdatable; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kInterface->getCapabilities(); }
    Capabilities::PerformanceInfo getPerformance(OperandType type) const override {
        return getCapabilities().operandPerformance.lookup(type);
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceScalar;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceTensor;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override {
        return getCapabilities().ifPerformance;
    }
    Capabilities::PerformanceInfo getWhilePerformance() const override {
        return getCapabilities().whilePerformance;
    }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return kInterface->getNumberOfCacheFilesNeeded();
    }
    bool isCachingSupported() const override {
        // Caching is supported if either of numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] = getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override {
        auto result = kInterface->wait();
        if (!result.ok()) {
            LOG(ERROR) << "DriverDevice::wait error: " << result.error().message;
            return convertErrorStatusToResultCode(result.error().code);
        }
        return ANEURALNETWORKS_NO_ERROR;
    }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType) const override;

   private:
    const SharedDevice kInterface;
    const bool kIsUpdatable;

    GeneralResult<std::vector<bool>> getSupportedOperationsImpl(const MetaModel& metaModel) const;
    GeneralResult<SharedPreparedModel> prepareModelFromCacheInternal(
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const CacheToken& token) const;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A RuntimePreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public RuntimePreparedModel {
   public:
    DriverPreparedModel(const Device* device, const SharedPreparedModel& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    SharedPreparedModel getInterface() const override { return mPreparedModel; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override {
        return mPreparedModel->configureExecutionBurst();
    }

    MemoryPreference getMemoryPreference() const override {
        if (mDevice->getFeatureLevel() >= ANEURALNETWORKS_FEATURE_LEVEL_5) {
            return {kDefaultRequestMemoryAlignment, kDefaultRequestMemoryPadding};
        } else {
            // We are not able to pass memory padding information to HIDL drivers, so return the
            // minimum padding.
            return {kDefaultRequestMemoryAlignment, kMinMemoryPadding};
        }
    }

   private:
    const Device* mDevice;
    const SharedPreparedModel mPreparedModel;
};

class DriverExecution : public RuntimeExecution {
   public:
    DriverExecution(SharedExecution execution, Request request,
                    std::vector<const RuntimeMemory*> memories, MeasureTiming measure,
                    OptionalDuration loopTimeoutDuration, int64_t deviceFeatureLevel)
        : kExecution(std::move(execution)),
          kRequest(std::move(request)),
          kMemories(std::move(memories)),
          kMeasure(measure),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)),
          kDeviceFeatureLevel(deviceFeatureLevel) {
        CHECK(kExecution != nullptr);
    }

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const SharedExecution kExecution;

    // For burst execution.
    const Request kRequest;
    const std::vector<const RuntimeMemory*> kMemories;
    const MeasureTiming kMeasure;
    const OptionalDuration kLoopTimeoutDuration;
    mutable std::map<const IBurst*, SharedExecution> mCachedBurstExecutions;

    // For fenced execution.
    const int64_t kDeviceFeatureLevel;
};

DriverDevice::DriverDevice(SharedDevice device, bool isUpdatable)
    : kInterface(std::move(device)), kIsUpdatable(isUpdatable) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(SharedDevice device, bool isUpdatable) {
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create called with nullptr";
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device), isUpdatable);
}

int64_t DriverDevice::getFeatureLevel() const {
    Version featureLevel = kInterface->getFeatureLevel();
    switch (featureLevel) {
        case Version::ANDROID_OC_MR1:
            return ANEURALNETWORKS_FEATURE_LEVEL_1;
        case Version::ANDROID_P:
            return ANEURALNETWORKS_FEATURE_LEVEL_2;
        case Version::ANDROID_Q:
            return ANEURALNETWORKS_FEATURE_LEVEL_3;
        case Version::ANDROID_R:
            return ANEURALNETWORKS_FEATURE_LEVEL_4;
        case Version::ANDROID_S:
            return ANEURALNETWORKS_FEATURE_LEVEL_5;
        case Version::CURRENT_RUNTIME:
            break;
    }
    LOG(FATAL) << "Unsupported driver feature level: " << featureLevel;
    return -1;
}

GeneralResult<std::vector<bool>> DriverDevice::getSupportedOperationsImpl(
        const MetaModel& metaModel) const {
    const auto featureLevel = kInterface->getFeatureLevel();
    const auto slice = metaModel.getSlice(featureLevel);
    if (!slice.has_value()) {
        return NN_ERROR() << "getSlice(" << featureLevel << ") failed";
    }

    const auto& [sliceModel, slicedModelOperationIndexToModelOperationIndex] = *slice;
    const std::vector<bool> supported = NN_TRY(kInterface->getSupportedOperations(sliceModel));
    const uint32_t slicedOperationCount = sliceModel.main.operations.size();
    if (supported.size() != slicedOperationCount) {
        return NN_ERROR() << "IDevice::getSupportedOperations returned a vector of length "
                          << supported.size() << " when expecting " << slicedOperationCount;
    }

    const Model& model = metaModel.getModel();
    const uint32_t operationCount = model.main.operations.size();
    std::vector<bool> remappedSupported(operationCount, false);
    for (size_t i = 0; i < supported.size(); ++i) {
        if (supported[i]) {
            remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
        }
    }
    return remappedSupported;
}
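
// Note: getSupportedOperationsImpl() queries the driver against a "slice" of the model that only
// contains operations expressible at the driver's feature level, then maps the driver's answers
// back onto the full model. For example, if sliced operation 0 corresponds to operation 3 of the
// full model and the driver reports it as supported, remappedSupported[3] becomes true; any
// operation that could not be represented in the slice simply stays false.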

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();

    auto result = getSupportedOperationsImpl(metaModel);
    if (!result.ok()) {
        LOG(ERROR) << "getSupportedOperations failed with code " << result.error().code << ": "
                   << result.error().message;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(model.main.operations.size(), false);
    }

    std::vector<bool>& supportedOperations = result.value();
#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = model.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&model, &accumulator](const std::vector<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = model.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (const Dimension& dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY ||
                        operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE ||
                        operand.lifetime == Operand::LifeTime::POINTER) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}
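
// Note: the NN_DEBUGGABLE block above only takes effect when the "debug.nn.sample.supported"
// property was read as 1 for a sample driver (see the constructor). It hashes each supported
// operation's type, operand types, dimensions, and lifetimes together with the device name and,
// whenever the resulting accumulator is odd, reports that operation as unsupported. This
// deterministically drops a pseudo-random subset of operations so that partitioning a model
// across several devices can be exercised in tests.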

// Opens a cache file for reading and writing and returns a shared handle.
static GeneralResult<SharedHandle> createCacheHandle(const std::string& filename,
                                                     bool createIfNotExist) {
    auto fd = base::unique_fd(open(filename.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
                                   S_IRUSR | S_IWUSR));
    if (fd.get() == -1) {
        return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
               << "Failed to " << (createIfNotExist ? "open or create" : "open") << " cache file "
               << filename;
    }
    std::vector<base::unique_fd> fds;
    fds.push_back(std::move(fd));
    return std::make_shared<const Handle>(Handle{
            .fds = std::move(fds),
            .ints = {},
    });
}

// Opens a list of cache files and returns a vector of shared handles. The files
// are always opened with both read and write permissions.
static GeneralResult<std::vector<SharedHandle>> createCacheHandleVec(
        uint32_t numCacheFiles, const std::string& baseFilename, bool createIfNotExist) {
    CHECK(numCacheFiles <= kMaxNumberOfCacheFiles);
    std::vector<SharedHandle> handles;
    handles.reserve(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFilename + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        handles.push_back(NN_TRY(createCacheHandle(filename, createIfNotExist)));
    }
    return handles;
}

// Maps a token to cache file names and returns a pair of vectors of shared
// handles to the opened files.
static GeneralResult<CacheHandles> getCacheHandles(
        const CacheInfo& cacheInfo, const CacheToken& token,
        const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist) {
    if (const auto* cacheHandles = std::get_if<CacheHandles>(&cacheInfo.variant)) {
        if (cacheHandles->modelCache.size() != numCacheFiles.first) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.first << " model cache handles, got "
                   << cacheHandles->modelCache.size();
        }
        if (cacheHandles->dataCache.size() != numCacheFiles.second) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.second << " data cache handles, got "
                   << cacheHandles->dataCache.size();
        }
        return *cacheHandles;
    }

    // The filename includes kByteSizeOfCacheToken * 2 characters for token,
    // and 1 character for model/data cache identifier.
    std::string filename(kByteSizeOfCacheToken * 2 + 1, '0');
    for (uint32_t i = 0; i < kByteSizeOfCacheToken; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }

    const auto& cacheDir = std::get<CacheDir>(cacheInfo.variant);
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;
    const uint32_t cacheTypeIdentifierIndex = cacheDir.size() + kByteSizeOfCacheToken * 2;

    cacheFileName[cacheTypeIdentifierIndex] = '1';
    std::vector<SharedHandle> modelCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist));

    cacheFileName[cacheTypeIdentifierIndex] = '2';
    std::vector<SharedHandle> dataCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist));

    return CacheHandles{
            .modelCache = std::move(modelCache),
            .dataCache = std::move(dataCache),
    };
}
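
// Note on the cache file naming scheme above: each token byte expands into two letters in the
// range 'A'..'P' (low nibble first, then high nibble), so a token byte of 0x5A becomes "KF". The
// trailing character selects the cache type ('1' for model cache, '2' for data cache), and
// createCacheHandleVec() appends the file index. So, for a hypothetical cache directory
// "/data/cache/", the first model cache file for a token would look like
// "/data/cache/<64 token letters>10".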

GeneralResult<SharedPreparedModel> DriverDevice::prepareModelFromCacheInternal(
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const CacheToken& token) const {
    // Get cache files if they exist, otherwise return from the function early.
    auto cache = NN_TRY(getCacheHandles(cacheInfo, token, kInterface->getNumberOfCacheFilesNeeded(),
                                        /*createIfNotExist=*/false));
    return kInterface->prepareModelFromCache(deadline, cache.modelCache, cache.dataCache, token);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const std::optional<CacheToken>& maybeToken) const {
    // Attempt to compile from cache if token is present.
    if (maybeToken.has_value()) {
        auto result = prepareModelFromCacheInternal(deadline, cacheInfo, *maybeToken);
        if (result.has_value()) {
            return {ANEURALNETWORKS_NO_ERROR,
                    std::make_shared<DriverPreparedModel>(this, std::move(result).value())};
        } else {
            LOG(ERROR) << "prepareModelFromCache failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get cache files if they exist, otherwise create them.
    CacheHandles cache;
    if (maybeToken.has_value()) {
        auto result =
                getCacheHandles(cacheInfo, *maybeToken, kInterface->getNumberOfCacheFilesNeeded(),
                                /*createIfNotExist=*/true);
        if (result.has_value()) {
            cache = std::move(result).value();
        } else {
            LOG(ERROR) << "getCacheHandles failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get the token if it exists, otherwise get a null token.
    static constexpr CacheToken kNullToken = {};
    const CacheToken token = maybeToken.value_or(kNullToken);

    // Fallback to full compilation (possibly with token) if
    // prepareModelFromCache could not be used or failed.
    const Model model = makeModel();
    auto result = kInterface->prepareModel(model, preference, priority, deadline, cache.modelCache,
                                           cache.dataCache, token);
    if (!result.ok()) {
        LOG(ERROR) << "IDevice::prepareModel() error: " << result.error().message;
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    SharedPreparedModel preparedModel = std::move(result).value();
    CHECK(preparedModel != nullptr)
            << "IDevice::prepareModel() returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR,
            std::make_shared<DriverPreparedModel>(this, std::move(preparedModel))};
}
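
// In short, DriverDevice::prepareModel() tries three things in order: (1) if a cache token is
// present, ask the driver to re-create the prepared model directly from the existing cache files;
// (2) on failure, open or create the cache files and fall back to a full compile, passing the
// handles and token so the driver can repopulate the cache; (3) if no token was supplied at all,
// do a full compile with a null token and empty cache handle lists.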

std::pair<int, std::unique_ptr<RuntimeMemory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                                      OperandType) const {
    const BufferDesc bufferDesc = {.dimensions = desc.dimensions};
    std::vector<SharedPreparedModel> preparedModels(desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto result =
            kInterface->allocate(bufferDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (!result.ok()) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    return MemoryFromDevice::create(std::move(result).value());
}

static Request createDriverRequest(const std::vector<ModelArgumentInfo>& inputs,
                                   const std::vector<ModelArgumentInfo>& outputs,
                                   const std::vector<const RuntimeMemory*>& memories) {
    Request request;
    request.inputs.reserve(inputs.size());
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(request.inputs),
                   [](const auto& input) { return input.createRequestArgument(); });
    request.outputs.reserve(outputs.size());
    std::transform(outputs.begin(), outputs.end(), std::back_inserter(request.outputs),
                   [](const auto& output) { return output.createRequestArgument(); });
    request.pools.reserve(memories.size());
    std::transform(memories.begin(), memories.end(), std::back_inserter(request.pools),
                   [](const RuntimeMemory* memory) { return memory->getMemoryPool(); });
    return request;
}

// Perform computation on an actual device driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::execute::execute");

    ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> result;

    // compute using burst if present, otherwise compute from IPreparedModel
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        for (const RuntimeMemory* memory : memories) {
            const auto pool = memory->getMemoryPool();
            if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                auto cacheHold = burstController->cacheMemory(*maybeMemory);
                memory->hold(cacheHold);
            }
        }

        VLOG(EXECUTION) << "Before burstController->execute() " << SHOW_IF_DEBUG(request);

        result = burstController->execute(request, measure, deadline, loopTimeoutDuration);
    } else {
        result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
    }

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing;

    if (result.ok()) {
        n = ANEURALNETWORKS_NO_ERROR;
        std::tie(outputShapes, timing) = std::move(result).value();
    } else {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            outputShapes = std::move(returnedOutputShapes);
        }
        return {n, std::move(outputShapes), timing};
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::executeFenced");

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (mDevice->getFeatureLevel() >= kHalVersionV1_3ToApi.featureLevel) {
        auto result = mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                                    loopTimeoutDuration, timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::executeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**executeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fallback to synchronous execution if executeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::execute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}
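
// Note on the fenced path above: for drivers at HAL 1.3 feature level or later, the wait fences
// are handed directly to IPreparedModel::executeFenced() and the driver returns its own output
// sync fence. For older drivers, the runtime itself blocks on every input fence with syncWait()
// and then runs a plain synchronous execute(); in that case syncFence keeps the "already
// signaled" value it was initialized with, so any fence fd handed back to the caller is signaled
// by the time the call returns.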

std::pair<int, std::shared_ptr<RuntimeExecution>> DriverPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::createReusableExecution");

    auto request = createDriverRequest(inputs, outputs, memories);
    auto result = mPreparedModel->createReusableExecution(request, measure, loopTimeoutDuration);
    if (!result.ok()) {
        LOG(ERROR) << "IPreparedModel::createReusableExecution() error: " << result.error().message;
        const int n = convertErrorStatusToResultCode(result.error().code);
        return {n, nullptr};
    }
    auto execution = std::make_shared<DriverExecution>(
            std::move(result).value(), std::move(request), memories, measure, loopTimeoutDuration,
            mDevice->getFeatureLevel());
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> DriverExecution::compute(
        const SharedBurst& burstController, const OptionalTimePoint& deadline) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::compute");

    // compute using burst if present, otherwise compute from IPreparedModel
    SharedExecution execution;
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        // create a reusable burst execution if the controller is not seen before
        auto burstExecution = mCachedBurstExecutions.find(burstController.get());
        if (burstExecution == mCachedBurstExecutions.end()) {
            for (const RuntimeMemory* memory : kMemories) {
                const auto pool = memory->getMemoryPool();
                if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                    auto cacheHold = burstController->cacheMemory(*maybeMemory);
                    memory->hold(cacheHold);
                }
            }
            auto createResult = burstController->createReusableExecution(kRequest, kMeasure,
                                                                         kLoopTimeoutDuration);
            if (!createResult.ok()) {
                LOG(ERROR) << "IBurst::createReusableExecution() error: "
                           << createResult.error().message;
                const int n = convertErrorStatusToResultCode(createResult.error().code);
                return {n, {}, {}};
            }
            execution = std::move(createResult).value();
            mCachedBurstExecutions.emplace(burstController.get(), execution);
        } else {
            execution = burstExecution->second;
        }
        VLOG(EXECUTION) << "Before mBurstExecution->compute() " << SHOW_IF_DEBUG(kRequest);
    } else {
        execution = kExecution;
    }

    CHECK(execution != nullptr);
    auto result = execution->compute(deadline);
    if (!result.ok()) {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        int n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            return {n, std::move(returnedOutputShapes), {}};
        }
        return {n, {}, {}};
    }

    VLOG(EXECUTION) << "DriverExecution::compute completed";
    auto [outputShapes, timing] = std::move(result).value();
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::computeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (kDeviceFeatureLevel >= kHalVersionV1_3ToApi.featureLevel) {
        auto result =
                kExecution->computeFenced(waitForHandles, deadline, timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::computeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**computeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fallback to synchronous execution if computeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = kExecution->compute(deadline);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::compute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverExecution::computeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

static Capabilities createCpuCapabilities() {
    constexpr Capabilities::PerformanceInfo kPerf = {.execTime = 1.0f, .powerUsage = 1.0f};
    constexpr OperandType operandTypes[] = {
            OperandType::FLOAT32,
            OperandType::INT32,
            OperandType::UINT32,
            OperandType::TENSOR_FLOAT32,
            OperandType::TENSOR_INT32,
            OperandType::TENSOR_QUANT8_ASYMM,
            OperandType::BOOL,
            OperandType::TENSOR_QUANT16_SYMM,
            OperandType::TENSOR_FLOAT16,
            OperandType::TENSOR_BOOL8,
            OperandType::FLOAT16,
            OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL,
            OperandType::TENSOR_QUANT16_ASYMM,
            OperandType::TENSOR_QUANT8_SYMM,
            OperandType::TENSOR_QUANT8_ASYMM_SIGNED,
    };

    std::vector<Capabilities::OperandPerformance> operandPerformance;
    operandPerformance.reserve(std::size(operandTypes));
    std::transform(std::begin(operandTypes), std::end(operandTypes),
                   std::back_inserter(operandPerformance), [kPerf](OperandType type) {
                       return Capabilities::OperandPerformance{.type = type, .info = kPerf};
                   });

    auto table =
            Capabilities::OperandPerformanceTable::create(std::move(operandPerformance)).value();

    return Capabilities{
            .relaxedFloat32toFloat16PerformanceScalar = kPerf,
            .relaxedFloat32toFloat16PerformanceTensor = kPerf,
            .operandPerformance = std::move(table),
            .ifPerformance = kPerf,
            .whilePerformance = kPerf,
    };
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    bool isUpdatable() const override { return false; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kCapabilities; }
    Capabilities::PerformanceInfo getPerformance(OperandType) const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override { return kPerformance; }
    Capabilities::PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return {/*numModelCache=*/0, /*numDataCache=*/0};
    }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = kCurrentNNAPIRuntimeFeatureLevel;
    const std::string kName = "nnapi-reference";
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    const std::string kVersionString = build::GetBuildNumber();
#else
    const std::string kVersionString = "UNKNOWN";
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const Capabilities::PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const Capabilities kCapabilities = createCpuCapabilities();
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted RuntimePreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public RuntimePreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<RuntimePreparedModel>> create(Model model);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    SharedPreparedModel getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override { return nullptr; }

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    MemoryPreference getMemoryPreference() const override {
        return {kPreferredAlignment, kPreferredPadding};
    }

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

    const Model& getModel() const { return mModel; }
    const std::vector<RunTimePoolInfo>& getModelPoolInfos() const { return mModelPoolInfos; }

   private:
    // TFLite kernels prefer 64 bytes for padding and alignment.
    static constexpr uint32_t kPreferredAlignment = 64;
    static constexpr uint32_t kPreferredPadding = 64;

    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

class CpuExecution : public RuntimeExecution {
   public:
    CpuExecution(const CpuPreparedModel& preparedModel, Request request,
                 std::vector<RunTimePoolInfo> requestPoolInfos,
                 OptionalDuration loopTimeoutDuration)
        : kPreparedModel(preparedModel),
          kRequest(std::move(request)),
          kRequestPoolInfos(std::move(requestPoolInfos)),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)) {}

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const CpuPreparedModel& kPreparedModel;
    Request kRequest;
    std::vector<RunTimePoolInfo> kRequestPoolInfos;
    const OptionalDuration kLoopTimeoutDuration;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();
    const size_t count = model.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        //                    We may want to use the slicer for CpuDevice just as we do for
        //                    DriverDevice.
        OperationType operationType = model.main.operations[i].type;
        result[i] = !isExtension(operationType) && operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& /*cacheInfo*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (auto result = validate(model); !result.ok()) {
        LOG(ERROR) << "Invalid Model: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(preference); !result.ok()) {
        LOG(ERROR) << "Invalid ExecutionPreference: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(priority); !result.ok()) {
        LOG(ERROR) << "Invalid Priority: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<RuntimeMemory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                                   OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuPreparedModel::create(Model model) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromCanonicalMemories(&poolInfos, model.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<RuntimePreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(model), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.has_value()) {
        executor.setLoopTimeout(loopTimeoutDuration->count());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, {}};
}
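
// Note: computeOnCpu() is a thin wrapper around CpuExecutor: the loop timeout and deadline (when
// present) are forwarded to the executor, the result code comes straight from CpuExecutor::run(),
// and the Timing slot is returned empty here because this reference CPU path does not report
// driver-measured execution times.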
1044 
executeFenced(const std::vector<ModelArgumentInfo> & inputs,const std::vector<ModelArgumentInfo> & outputs,const std::vector<const RuntimeMemory * > & memories,const std::vector<int> & waitFor,MeasureTiming measure,const OptionalTimePoint & deadline,const OptionalDuration & loopTimeoutDuration,const OptionalDuration & duration) const1045 std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuPreparedModel::executeFenced(
1046         const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
1047         const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
1048         MeasureTiming measure, const OptionalTimePoint& deadline,
1049         const OptionalDuration& loopTimeoutDuration, const OptionalDuration& duration) const {
1050     VLOG(EXECUTION)
1051             << "CpuPreparedModel::executeFenced waits for sync fences to signal before execution";
1052     for (int syncFd : waitFor) {
1053         if (syncFd > 0) {
1054             auto r = syncWait(syncFd, -1);
1055             if (r != FenceState::SIGNALED) {
1056                 LOG(ERROR) << "sync wait failed, fd: " << syncFd;
1057                 return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
1058             }
1059         }
1060     }
1061 
1062     // Update deadline if the timeout duration is closer than the deadline.
1063     auto closestDeadline = deadline;
1064     if (duration.has_value()) {
1065         const auto timeoutDurationDeadline = makeDeadline(*duration);
1066         if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
1067             closestDeadline = timeoutDurationDeadline;
1068         }
1069     }
1070 
1071     const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
1072                                                         closestDeadline, loopTimeoutDuration);
1073     return {result, -1, nullptr, timing};
1074 }
1075 
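// Builds a canonical Request for the CPU executor: memory-backed arguments reuse the
// caller's mapped pools, while each POINTER argument gets its own single-buffer pool so
// the executor can read and write the raw pointer without copying.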
1076 static std::tuple<int, Request, std::vector<RunTimePoolInfo>> createCpuRequest(
1077         const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
1078         const std::vector<const RuntimeMemory*>& memories) {
1079     std::vector<RunTimePoolInfo> requestPoolInfos;
1080     requestPoolInfos.reserve(memories.size());
1081     for (const RuntimeMemory* mem : memories) {
1082         if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
1083             requestPoolInfos.emplace_back(*poolInfo);
1084         } else {
1085             return {ANEURALNETWORKS_UNMAPPABLE, {}, {}};
1086         }
1087     }
1088     // Create one pool for each pointer-based input/output argument.
1089     auto fixPointerArguments =
1090             [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
1091                 std::vector<DataLocation> ptrArgsLocations;
1092                 for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
1093                     if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
1094                         ptrArgsLocations.push_back(
1095                                 {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
1096                                  .offset = 0,
1097                                  .length = argumentInfo.length(),
1098                                  .padding = argumentInfo.padding()});
1099                         requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
1100                                 static_cast<uint8_t*>(argumentInfo.buffer())));
1101                     }
1102                 }
1103                 return ptrArgsLocations;
1104             };
1105     const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
1106     const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);
1107 
1108     Request request;
1109     request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
1110     request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
1111     return {ANEURALNETWORKS_NO_ERROR, std::move(request), std::move(requestPoolInfos)};
1112 }
1113 
1114 // Perform computation on NNAPI CPU reference implementation.
1115 //
1116 // Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
1117 // same process as the NNAPI runtime and can take raw pointers. This method creates one
1118 // pool per pointer-based input/output to avoid copying data.
1119 //
1120 // Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
1121 std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
1122         const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
1123         const std::vector<const RuntimeMemory*>& memories, const SharedBurst& /*burstController*/,
1124         MeasureTiming /*measure*/, const OptionalTimePoint& deadline,
1125         const OptionalDuration& loopTimeoutDuration) const {
1126     if (hasDeadlinePassed(deadline)) {
1127         return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
1128     }
1129 
1130     int nCreateRequest;
1131     Request request;
1132     std::vector<RunTimePoolInfo> requestPoolInfos;
1133     std::tie(nCreateRequest, request, requestPoolInfos) =
1134             createCpuRequest(inputs, outputs, memories);
1135     if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
1136         return {nCreateRequest, {}, {}};
1137     }
1138 
1139     if (!DeviceManager::get()->syncExecCpu()) {
1140         // TODO: use a thread pool
1141         // TODO(mikie): this could have NNTRACE so we could measure the overhead
1142         //              of spinning up a new thread.
1143         std::tuple<int, std::vector<OutputShape>, Timing> result = {};
1144         std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
1145             result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
1146                                   loopTimeoutDuration);
1147         }).join();
1148         return result;
1149     }
1150 
1151     return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
1152                         loopTimeoutDuration);
1153 }
1154 
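// Reusable executions pre-build the Request and pool infos once, so repeated
// CpuExecution::compute() calls skip the per-execution argument setup performed by
// execute().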
1155 std::pair<int, std::shared_ptr<RuntimeExecution>> CpuPreparedModel::createReusableExecution(
1156         const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
1157         const std::vector<const RuntimeMemory*>& memories, MeasureTiming /*measure*/,
1158         const OptionalDuration& loopTimeoutDuration) const {
1159     auto [nCreateRequest, request, requestPoolInfos] = createCpuRequest(inputs, outputs, memories);
1160     if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
1161         return {nCreateRequest, nullptr};
1162     }
1163     auto execution = std::make_shared<CpuExecution>(
1164             *this, std::move(request), std::move(requestPoolInfos), loopTimeoutDuration);
1165     return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
1166 }
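
// Illustrative sketch only: reusing one CpuExecution for several computations with the
// same argument buffers. `preparedModel`, `inputs`, `outputs`, `memories`, and
// `kNumRuns` are assumed to be provided by the caller and are not defined in this file.
//
//     auto [err, execution] = preparedModel->createReusableExecution(
//             inputs, outputs, memories, MeasureTiming::NO, /*loopTimeoutDuration=*/{});
//     if (err == ANEURALNETWORKS_NO_ERROR) {
//         for (int i = 0; i < kNumRuns; ++i) {
//             auto [status, shapes, timing] =
//                     execution->compute(/*burstController=*/nullptr, /*deadline=*/{});
//         }
//     }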
1167 
1168 std::tuple<int, std::vector<OutputShape>, Timing> CpuExecution::compute(
1169         const SharedBurst& /*burstController*/, const OptionalTimePoint& deadline) const {
1170     if (hasDeadlinePassed(deadline)) {
1171         return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
1172     }
1173 
1174     if (!DeviceManager::get()->syncExecCpu()) {
1175         // TODO: use a thread pool
1176         // TODO(mikie): this could have NNTRACE so we could measure the overhead
1177         //              of spinning up a new thread.
1178         std::tuple<int, std::vector<OutputShape>, Timing> result = {};
1179         std::thread([this, &deadline, &result] {
1180             result = computeOnCpu(kPreparedModel.getModel(), kRequest,
1181                                   kPreparedModel.getModelPoolInfos(), kRequestPoolInfos, deadline,
1182                                   kLoopTimeoutDuration);
1183         }).join();
1184         return result;
1185     }
1186 
1187     return computeOnCpu(kPreparedModel.getModel(), kRequest, kPreparedModel.getModelPoolInfos(),
1188                         kRequestPoolInfos, deadline, kLoopTimeoutDuration);
1189 }
1190 
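// Same fence-wait and deadline-clamping behavior as CpuPreparedModel::executeFenced,
// but applied to the pre-built reusable request via compute().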
1191 std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuExecution::computeFenced(
1192         const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
1193         const OptionalDuration& duration) const {
1194     VLOG(EXECUTION)
1195             << "CpuExecution::computeFenced waits for sync fences to signal before execution";
1196     for (int syncFd : waitFor) {
1197         if (syncFd > 0) {
1198             auto r = syncWait(syncFd, -1);
1199             if (r != FenceState::SIGNALED) {
1200                 LOG(ERROR) << "sync wait failed, fd: " << syncFd;
1201                 return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
1202             }
1203         }
1204     }
1205 
1206     // Update deadline if the timeout duration is closer than the deadline.
1207     auto closestDeadline = deadline;
1208     if (duration.has_value()) {
1209         const auto timeoutDurationDeadline = makeDeadline(*duration);
1210         if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
1211             closestDeadline = timeoutDurationDeadline;
1212         }
1213     }
1214 
1215     const auto [result, outputShapes, timing] = compute(nullptr, closestDeadline);
1216     return {result, -1, nullptr, timing};
1217 }
1218 
1219 DeviceManager* DeviceManager::get() {
1220     static DeviceManager manager;
1221     return &manager;
1222 }
1223 
1224 std::shared_ptr<Device> DeviceManager::getCpuDevice() {
1225     return CpuDevice::get();
1226 }
1227 
1228 std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const SharedDevice& device) {
1229     VLOG(MANAGER) << "forTest_makeDriverDevice(" << device->getName() << ")";
1230     const auto driverDevice = DriverDevice::create(device);
1231     CHECK(driverDevice != nullptr);
1232     return driverDevice;
1233 }
1234 
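// Discovers the available driver devices. In the platform build, updatable drivers are
// requested only for processes that are not on the system, vendor, or product images;
// in the compatibility-library build, every device reported by getDevices() is wrapped
// as-is.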
1235 #ifndef NN_COMPATIBILITY_LIBRARY_BUILD
1236 std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
1237     const auto& appInfo = AppInfoFetcher::get()->getAppInfo();
1238     const bool currentProcessIsOnThePlatform =
1239             appInfo.appIsSystemApp || appInfo.appIsOnVendorImage || appInfo.appIsOnProductImage;
1240 
1241     const bool includeUpdatableDrivers = !currentProcessIsOnThePlatform;
1242     auto devicesAndUpdatability =
1243             hardware::neuralnetworks::service::getDevices(includeUpdatableDrivers);
1244 
1245     std::vector<std::shared_ptr<DriverDevice>> driverDevices;
1246     driverDevices.reserve(devicesAndUpdatability.size());
1247     for (auto& [device, isDeviceUpdatable] : devicesAndUpdatability) {
1248         driverDevices.push_back(DriverDevice::create(std::move(device), isDeviceUpdatable));
1249     }
1250     return driverDevices;
1251 }
1252 #else
1253 std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
1254     auto devices = getDevices();
1255     std::vector<std::shared_ptr<DriverDevice>> driverDevices;
1256     driverDevices.reserve(devices.size());
1257     for (auto& device : devices) {
1258         driverDevices.push_back(DriverDevice::create(std::move(device)));
1259     }
1260     return driverDevices;
1261 }
1262 #endif  // NN_COMPATIBILITY_LIBRARY_BUILD
1263 
1264 void DeviceManager::findAvailableDevices() {
1265     VLOG(MANAGER) << "findAvailableDevices";
1266 
1267     // register driver devices
1268     auto driverDevices = getDriverDevices();
1269     for (auto& driverDevice : driverDevices) {
1270         VLOG(MANAGER) << "Found interface " << driverDevice->getName();
1271         mDevices.push_back(std::move(driverDevice));
1272     }
1273 
1274 #ifndef NN_COMPATIBILITY_LIBRARY_BUILD
1275     // register CPU fallback device
1276     mDevices.push_back(CpuDevice::get());
1277     mDevicesCpuOnly.push_back(CpuDevice::get());
1278 #endif  // NN_COMPATIBILITY_LIBRARY_BUILD
1279 }
1280 
1281 void DeviceManager::registerDevice(const SharedDevice& device) {
1282     if (auto driverDevice = DriverDevice::create(device)) {
1283         mDevices.push_back(std::move(driverDevice));
1284     }
1285 }
1286 
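// On debuggable builds (NN_DEBUGGABLE), the constructor additionally reads the
// debug.nn.* system properties below (strict-slicing, partition, cpuonly, syncexec-cpu,
// syncexec-runtime) so the defaults can be overridden at runtime, e.g. via
// "adb shell setprop".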
1287 DeviceManager::DeviceManager() {
1288     VLOG(MANAGER) << "DeviceManager::DeviceManager";
1289     findAvailableDevices();
1290 #ifdef NN_DEBUGGABLE
1291     mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
1292     mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
1293     mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
1294     mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
1295     mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
1296 #endif  // NN_DEBUGGABLE
1297 }
1298 
1299 }  // namespace nn
1300 }  // namespace android
1301