1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <android-base/logging.h>
18 #include <android-base/unique_fd.h>
19 #include <android/hardware_buffer.h>
20 #include <gtest/gtest.h>
21 #include <vulkan/vulkan.h>
22 #include <vulkan/vulkan_android.h>
23 
24 #include <algorithm>
25 #include <cmath>
26 #include <cstring>
27 #include <memory>
#include <optional>
28 #include <string>
29 #include <utility>
30 #include <vector>
31 
32 #include "TestNeuralNetworksWrapper.h"
33 
34 #ifndef NNTEST_ONLY_PUBLIC_API
35 #include "Manager.h"
36 #endif
37 
38 namespace android::nn {
39 namespace {
40 
41 using Type = test_wrapper::Type;
42 using OperandType = test_wrapper::OperandType;
43 using Result = test_wrapper::Result;
44 
45 constexpr uint32_t kOperandSizeX = 256;
46 constexpr uint32_t kOperandSizeY = 256;
47 constexpr uint32_t kOperandLength = kOperandSizeX * kOperandSizeY;
48 constexpr uint32_t kNumberOfIterationsToTest = 100;
49 constexpr uint32_t kMaxNumberOfPrintedErrors = 10;
50 
51 // This file implements a test suite that exercises a GPU -> NNAPI pipeline using AHardwareBuffer
52 // and sync fence. One pass of the pipeline involves the following three stages:
53 //
54 //   - GPU: Invoke the compute shader to clear all elements in the output buffer to the value "1"
55 //          of the corresponding element type. Because the GPU may not natively support
56 //          float16/int8/uint8 data types, we pack each data type into a 4-byte chunk as uint32_t
57 //          and pass it to the shader. E.g., float16 will be packed as 0x3c003c00 -- the float16 value
58 //          of "1" (0x3c00) repeated twice. The compute shader will use this 4-byte chunk to clear
59 //          the data in the output buffer (see CLEAR_DATA in the compute shader code).
60 //
61 //          The GPU workload will output directly to an AHardwareBuffer and export an Android sync
62 //          fence.
63 //
64 //   - NNAPI: Execute a broadcast ADD operation
65 //
66 //                output = ADD(input, const, act)
67 //
68 //            where "input" and "output" are of size [kOperandSizeY, kOperandSizeX], "const" and
69 //            "act" are model constant operands, "const" is of size [1] and value "1" of the
70 //            corresponding element type, "act" = 0. The ADD operation will increment each element
71 //            in the input tensor by 1.
72 //
73 //            The NNAPI executor takes the GPU output AHardwareBuffer as its input memory,
74 //            and directly outputs to another AHardwareBuffer. We use startComputeWithDependencies
75 //            to wait on the sync fence from the GPU workload. If supported, the NNAPI executor will
76 //            emit a sync fence; otherwise, it will wait until the workload is finished.
77 //
78 //   - Check: Verify that each element in the resulting tensor is 1 + 1 = 2.
79 //
80 // We use the introspection API to run the pipeline with each individual driver. Because this test
81 // was added in NNAPI feature level 5, we exclude devices with a lower feature level. We expect
82 // that if the driver successfully prepares the model, it should finish execution without an error.
83 //
84 // The pipeline is tested with four data types: float32, float16, quant8_asymm, and
85 // quant8_asymm_signed. These data types are chosen to make sure that a driver is likely to
86 // support at least one of the data types.
87 //
88 // For each configuration, we run the pipeline for kNumberOfIterationsToTest iterations.
89 
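// SPIR-V code for the compute shader (compiled from shaders/TestGpuNnapi.comp), consumed below by
// vkCreateShaderModule.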
90 const std::vector<uint32_t> kComputeShader =
91 #include "shaders/TestGpuNnapi.comp.spv.inl"
92         ;
93 
94 // The expected element value in the final NNAPI output AHardwareBuffer.
95 constexpr uint32_t kExpectedResultInInt = 2;
96 
97 // Helper templates for information related to a primary tensor data type. Only four specializations
98 // exist for this template: Type::TENSOR_FLOAT32, Type::TENSOR_FLOAT16, Type::TENSOR_QUANT8_ASYMM,
99 // and Type::TENSOR_QUANT8_ASYMM_SIGNED. Each specialization corresponds to a primary data type for
100 // the testing pipeline.
101 //
102 // Each template specialization defines the following fields:
103 //   - ElementType: The corresponding C++ type. Use sizeof(ElementType) to get the element size.
104 //   - kIsQuantized: Whether the data type is a quantized type or not.
105 //   - kClearData: The CLEAR_DATA used in the compute shader.
106 template <Type dataType>
107 struct TestTypeHelper;
108 template <>
109 struct TestTypeHelper<Type::TENSOR_FLOAT32> {
110     using ElementType = float;
111     static constexpr bool kIsQuantized = false;
112     // One float32 of value (1.0) packed into uint32_t
113     static constexpr uint32_t kClearData = 0x3f800000;
114 };
115 template <>
116 struct TestTypeHelper<Type::TENSOR_FLOAT16> {
117     using ElementType = _Float16;
118     static constexpr bool kIsQuantized = false;
119     // Two float16 of value (1.0) packed into uint32_t
120     static constexpr uint32_t kClearData = 0x3c003c00;
121 };
122 template <>
123 struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM> {
124     using ElementType = uint8_t;
125     static constexpr bool kIsQuantized = true;
126     // Four uint8_t of value (1) packed into uint32_t
127     static constexpr uint32_t kClearData = 0x01010101;
128 };
129 template <>
130 struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED> {
131     using ElementType = int8_t;
132     static constexpr bool kIsQuantized = true;
133     // Four int8_t of value (1) packed into uint32_t
134     static constexpr uint32_t kClearData = 0x01010101;
135 };
136 
137 bool isExtensionSupported(const std::vector<VkExtensionProperties>& supportedExtensions,
138                           const char* requestedExtension) {
139     return std::any_of(supportedExtensions.begin(), supportedExtensions.end(),
140                        [requestedExtension](const auto& extension) {
141                            return strcmp(extension.extensionName, requestedExtension) == 0;
142                        });
143 }
144 
145 // Records the workgroup size and the group counts of dispatching the compute shader.
146 struct DispatchSize {
147     uint32_t workgroupSize;
148     uint32_t groupCountX;
149     uint32_t groupCountY;
150 };
151 
152 // Choose an appropriate dispatch size. We are using a square workgroup size.
153 template <Type dataType>
154 DispatchSize chooseDispatchSize(const VkPhysicalDeviceLimits& limits) {
155     // Compute the number of invocations along each dimension.
156     const uint32_t elementSize = sizeof(typename TestTypeHelper<dataType>::ElementType);
157     const uint32_t numberOfElementsPerInvocation = sizeof(uint32_t) / elementSize;
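    // Each invocation writes one packed uint32_t, covering "numberOfElementsPerInvocation"
    // consecutive elements along the X dimension of the output tensor.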
158     const uint32_t workgroupInvocationsX = kOperandSizeX / numberOfElementsPerInvocation;
159     const uint32_t workgroupInvocationsY = kOperandSizeY;
160 
161     // Make sure the workgroup size does not exceed the number of invocations along the X and Y
162     // dimensions.
163     uint32_t workgroupSize = std::min(workgroupInvocationsX, workgroupInvocationsY);
164 
165     // Make sure the workgroup size does not exceed the device limit along the X and Y dimensions.
166     workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[0]);
167     workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[1]);
168 
169     // Make sure the total number of invocations does not exceed the device limit.
170     uint32_t maxSquareWorkGroupSize =
171             static_cast<uint32_t>(std::sqrt(limits.maxComputeWorkGroupInvocations));
172     workgroupSize = std::min(workgroupSize, maxSquareWorkGroupSize);
173 
174     // Round down to a power of 2. This is to make sure workgroupInvocationsX and
175     // workgroupInvocationsY are divisible by the workgroup size so that we don't need to apply
176     // a bounds check in the shader.
177     uint32_t power = static_cast<uint32_t>(std::log2(static_cast<float>(workgroupSize)));
178     workgroupSize = 1u << power;
179     CHECK(workgroupInvocationsX % workgroupSize == 0);
180     CHECK(workgroupInvocationsY % workgroupSize == 0);
181 
182     return {
183             .workgroupSize = workgroupSize,
184             .groupCountX = workgroupInvocationsX / workgroupSize,
185             .groupCountY = workgroupInvocationsY / workgroupSize,
186     };
187 }
188 
189 // Find the first memory index that satisfies the requirements
190 // See VkAndroidHardwareBufferPropertiesANDROID::memoryTypeBits for the semantics of
191 // "memoryTypeBitsRequirement"
192 std::optional<uint32_t> findMemoryType(const VkPhysicalDeviceMemoryProperties& properties,
193                                        uint32_t memoryTypeBitsRequirement,
194                                        VkDeviceSize sizeRequirement) {
195     for (uint32_t memoryIndex = 0; memoryIndex < VK_MAX_MEMORY_TYPES; ++memoryIndex) {
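        // Bit "memoryIndex" of memoryTypeBitsRequirement is set iff memory type "memoryIndex"
        // is usable for importing the AHardwareBuffer.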
196         const uint32_t memoryTypeBits = (1 << memoryIndex);
197         const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits;
198         const uint32_t heapIndex = properties.memoryTypes[memoryIndex].heapIndex;
199         const bool isLargeEnough = properties.memoryHeaps[heapIndex].size >= sizeRequirement;
200         if (isRequiredMemoryType && isLargeEnough) return memoryIndex;
201     }
202 
203     // failed to find memory type.
204     return std::nullopt;
205 }
206 
207 void addBufferTransitionBarrier(VkCommandBuffer commandBuffer, VkBuffer buffer,
208                                 VkPipelineStageFlags srcStageMask,
209                                 VkPipelineStageFlags dstStageMask, VkAccessFlags srcAccessMask,
210                                 VkAccessFlags dstAccessMask, uint32_t srcQueue, uint32_t dstQueue) {
211     const VkBufferMemoryBarrier bufferBarrier = {
212             .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
213             .pNext = nullptr,
214             .srcAccessMask = srcAccessMask,
215             .dstAccessMask = dstAccessMask,
216             .srcQueueFamilyIndex = srcQueue,
217             .dstQueueFamilyIndex = dstQueue,
218             .buffer = buffer,
219             .offset = 0,
220             .size = VK_WHOLE_SIZE,
221     };
222     vkCmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, 0, 0, nullptr, 1,
223                          &bufferBarrier, 0, nullptr);
224 }
225 
226 void allocateBlobAhwb(uint32_t size, uint64_t usage, AHardwareBuffer** outAhwb) {
227     AHardwareBuffer_Desc desc = {
228             .width = size,
229             .height = 1u,
230             .layers = 1u,
231             .format = AHARDWAREBUFFER_FORMAT_BLOB,
232             .usage = usage,
233     };
234     ASSERT_EQ(AHardwareBuffer_allocate(&desc, outAhwb), 0);
235 }
236 
237 using NameAndDevice = std::pair<const char*, const ANeuralNetworksDevice*>;
238 
239 void getNnapiDevices(std::vector<NameAndDevice>* outDevices) {
240     // Get the number of available NNAPI devices
241     uint32_t numDevices = 0;
242     ASSERT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);
243 
244     std::vector<NameAndDevice> devices;
245     for (uint32_t i = 0; i < numDevices; i++) {
246         // Get device
247         ANeuralNetworksDevice* device;
248         ASSERT_EQ(ANeuralNetworks_getDevice(/*devIndex=*/i, &device), ANEURALNETWORKS_NO_ERROR);
249 
250         // Get device name
251         const char* deviceName = nullptr;
252         ASSERT_EQ(ANeuralNetworksDevice_getName(device, &deviceName), ANEURALNETWORKS_NO_ERROR);
253 
254         // Check device feature level. This test was added in NNAPI feature level 5, so skip if the
255         // device is of a lower feature level.
256         int64_t featureLevel;
257         ASSERT_EQ(ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel),
258                   ANEURALNETWORKS_NO_ERROR);
259         if (featureLevel < ANEURALNETWORKS_FEATURE_LEVEL_5) {
260             continue;
261         }
262 
263         devices.emplace_back(deviceName, device);
264     }
265     *outDevices = std::move(devices);
266 }
267 
268 std::vector<NameAndDevice> getNnapiDevices() {
269     std::vector<NameAndDevice> devices;
270     getNnapiDevices(&devices);
271     return devices;
272 }
273 
274 std::string printGpuNnapiTest(const testing::TestParamInfo<NameAndDevice>& info) {
275     std::string name = info.param.first;
276     // gtest test names must only contain alphanumeric characters
277     std::replace_if(
278             name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_');
279     return name;
280 }
281 
282 template <Type dataType>
283 class VulkanComputePipeline {
284    public:
285     // Returns the created object on success, or nullptr on failure.
286     static std::unique_ptr<VulkanComputePipeline> create(AHardwareBuffer* output) {
287         auto pipeline = std::make_unique<VulkanComputePipeline>();
288         pipeline->initialize(output);
289         return pipeline->mIsValid ? std::move(pipeline) : nullptr;
290     }
291 
292     ~VulkanComputePipeline() {
293         if (mDevice != VK_NULL_HANDLE) {
294             vkDestroyFence(mDevice, mFence, nullptr);
295             vkDestroyPipeline(mDevice, mPipeline, nullptr);
296             vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
297             vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
298             vkFreeMemory(mDevice, mOutputBufferMemory, nullptr);
299             vkDestroyBuffer(mDevice, mOutputBuffer, nullptr);
300             vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
301             vkDestroyCommandPool(mDevice, mCommandPool, nullptr);
302             vkDestroyDescriptorPool(mDevice, mDescriptorPool, nullptr);
303         }
304         vkDestroyDevice(mDevice, nullptr);
305         vkDestroyInstance(mInstance, nullptr);
306     }
307 
308     // Returns {success, sync_fd}
309     std::pair<bool, base::unique_fd> run() {
310         bool success = false;
311         base::unique_fd outSyncFd;
312         runInternal(&success, &outSyncFd);
313         return {success, std::move(outSyncFd)};
314     }
315 
316    private:
317     void initialize(AHardwareBuffer* output) {
318         // Create instance
319         const VkApplicationInfo applicationDesc = {
320                 .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
321                 .pApplicationName = "TestGpuNnapi",
322                 .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
323                 .apiVersion = VK_API_VERSION_1_1,
324         };
325         const VkInstanceCreateInfo instanceDesc = {
326                 .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
327                 .pApplicationInfo = &applicationDesc,
328                 .enabledLayerCount = 0,
329                 .ppEnabledLayerNames = nullptr,
330                 .enabledExtensionCount = 0,
331                 .ppEnabledExtensionNames = nullptr,
332         };
333         ASSERT_EQ(vkCreateInstance(&instanceDesc, nullptr, &mInstance), VK_SUCCESS);
334 
335         // Enumerate physical devices
336         uint32_t numberOfDevices = 0;
337         ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, nullptr), VK_SUCCESS);
338         std::vector<VkPhysicalDevice> physicalDevices(numberOfDevices);
339         ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, physicalDevices.data()),
340                   VK_SUCCESS);
341 
342         // Pick the first device with a compute queue
343         for (const auto& physicalDevice : physicalDevices) {
344             uint32_t numberOfQueueFamilies = 0;
345             vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
346                                                      nullptr);
347             std::vector<VkQueueFamilyProperties> queueFamilies(numberOfQueueFamilies);
348             vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
349                                                      queueFamilies.data());
350 
351             uint32_t pickedQueueFamilyIndex = 0;
352             bool hasComputeQueue = false;
353             for (uint32_t i = 0; i < queueFamilies.size(); i++) {
354                 if (queueFamilies[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
355                     pickedQueueFamilyIndex = i;
356                     hasComputeQueue = true;
357                     break;
358                 }
359             }
360             if (!hasComputeQueue) continue;
361             mPhysicalDevice = physicalDevice;
362             mQueueFamilyIndex = pickedQueueFamilyIndex;
363             break;
364         }
365         if (mPhysicalDevice == VK_NULL_HANDLE) {
366             GTEST_SKIP() << "No device can handle a compute queue";
367         }
368 
369         // Get physical device properties
370         vkGetPhysicalDeviceProperties(mPhysicalDevice, &mPhysicalDeviceProperties);
371         vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mPhysicalDeviceMemoryProperties);
372 
373         // Check physical device version
374         if (mPhysicalDeviceProperties.apiVersion < VK_API_VERSION_1_1) {
375             GTEST_SKIP() << "Device API version too low";
376         }
377 
378         // Check if the physical device is able to handle the compute work
379         const auto dispatchSize = chooseDispatchSize<dataType>(mPhysicalDeviceProperties.limits);
380         if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[0] <
381             dispatchSize.groupCountX) {
382             GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountX
383                          << " workgroups for the X dimension";
384         }
385         if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[1] <
386             dispatchSize.groupCountY) {
387             GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountY
388                          << " workgroups for the Y dimension";
389         }
390 
391         // Enumerate device extensions
392         uint32_t numberOfExtensions = 0;
393         ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
394                                                        &numberOfExtensions, nullptr),
395                   VK_SUCCESS);
396         std::vector<VkExtensionProperties> extensions(numberOfExtensions);
397         ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
398                                                        &numberOfExtensions, extensions.data()),
399                   VK_SUCCESS);
400 
401         // Required device extensions
402         std::vector<const char*> requiredDeviceExtensions = {
403                 // The following extensions are required to import an AHardwareBuffer to Vulkan
404                 VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME,
405                 VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME,
406                 VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
407                 VK_KHR_BIND_MEMORY_2_EXTENSION_NAME,
408                 VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
409                 // The following extensions are required to export a sync fence
410                 VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME,
411                 VK_KHR_MAINTENANCE1_EXTENSION_NAME,
412         };
413         for (const char* requiredDeviceExtension : requiredDeviceExtensions) {
414             if (!isExtensionSupported(extensions, requiredDeviceExtension)) {
415                 GTEST_SKIP() << "Device extension " << requiredDeviceExtension
416                              << " is not supported";
417             }
418         }
419 
420         // Check external memory properties
421         const VkPhysicalDeviceExternalBufferInfo externalBufferInfo = {
422                 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
423                 .pNext = nullptr,
424                 .flags = 0u,
425                 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
426                 .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
427         };
428         VkExternalBufferProperties externalBufferProperties;
429         vkGetPhysicalDeviceExternalBufferProperties(mPhysicalDevice, &externalBufferInfo,
430                                                     &externalBufferProperties);
431         if (!(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
432               VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT)) {
433             GTEST_SKIP() << "Device is not able to import Android hardware buffer";
434         }
435         ASSERT_FALSE(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
436                      VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT);
437 
438         // Check external fence properties
439         const VkPhysicalDeviceExternalFenceInfo externalFenceInfo = {
440                 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO,
441                 .pNext = nullptr,
442                 .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
443         };
444         VkExternalFenceProperties externalFenceProperties;
445         vkGetPhysicalDeviceExternalFenceProperties(mPhysicalDevice, &externalFenceInfo,
446                                                    &externalFenceProperties);
447         if (!(externalFenceProperties.externalFenceFeatures &
448               VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT)) {
449             GTEST_SKIP() << "Device is not able to export Android sync fence FD";
450         }
451 
452         // Create logical device
453         const float queuePriority = 1.0f;
454         const VkDeviceQueueCreateInfo queueDesc = {
455                 .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
456                 .queueFamilyIndex = mQueueFamilyIndex,
457                 .queueCount = 1,
458                 .pQueuePriorities = &queuePriority,
459         };
460         const VkDeviceCreateInfo deviceDesc = {
461                 .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
462                 .queueCreateInfoCount = 1,
463                 .pQueueCreateInfos = &queueDesc,
464                 .enabledExtensionCount = static_cast<uint32_t>(requiredDeviceExtensions.size()),
465                 .ppEnabledExtensionNames = requiredDeviceExtensions.data(),
466                 .pEnabledFeatures = nullptr,
467         };
468         ASSERT_EQ(vkCreateDevice(mPhysicalDevice, &deviceDesc, nullptr, &mDevice), VK_SUCCESS);
469         vkGetDeviceQueue(mDevice, mQueueFamilyIndex, 0, &mQueue);
470 
471         // Get extension function pointers
472         mPfnVkGetFenceFdKHR = reinterpret_cast<PFN_vkGetFenceFdKHR>(
473                 vkGetDeviceProcAddr(mDevice, "vkGetFenceFdKHR"));
474         ASSERT_NE(mPfnVkGetFenceFdKHR, nullptr);
475 
476         // Create descriptor pool
477         const std::vector<VkDescriptorPoolSize> descriptorPoolSizes = {
478                 {
479                         .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
480                         .descriptorCount = 1,
481                 },
482         };
483         const VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
484                 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
485                 .maxSets = 1,
486                 .poolSizeCount = static_cast<uint32_t>(descriptorPoolSizes.size()),
487                 .pPoolSizes = descriptorPoolSizes.data(),
488         };
489         ASSERT_EQ(vkCreateDescriptorPool(mDevice, &descriptorPoolCreateInfo, nullptr,
490                                          &mDescriptorPool),
491                   VK_SUCCESS);
492 
493         // Create descriptor set layout
494         const std::vector<VkDescriptorSetLayoutBinding> descriptorsetLayoutBinding = {
495                 {
496                         .binding = 0,  // output buffer
497                         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
498                         .descriptorCount = 1,
499                         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
500                 },
501 
502         };
503         const VkDescriptorSetLayoutCreateInfo descriptorsetLayoutDesc = {
504                 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
505                 .bindingCount = static_cast<uint32_t>(descriptorsetLayoutBinding.size()),
506                 .pBindings = descriptorsetLayoutBinding.data(),
507         };
508         ASSERT_EQ(vkCreateDescriptorSetLayout(mDevice, &descriptorsetLayoutDesc, nullptr,
509                                               &mDescriptorSetLayout),
510                   VK_SUCCESS);
511 
512         // Allocate descriptor set
513         const VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
514                 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
515                 .descriptorPool = mDescriptorPool,
516                 .descriptorSetCount = 1,
517                 .pSetLayouts = &mDescriptorSetLayout,
518         };
519         ASSERT_EQ(vkAllocateDescriptorSets(mDevice, &descriptorSetAllocateInfo, &mDescriptorSet),
520                   VK_SUCCESS);
521 
522         // Check the output AHardwareBuffer format and usage bits
523         AHardwareBuffer_Desc desc;
524         AHardwareBuffer_describe(output, &desc);
525         ASSERT_EQ(desc.format, AHARDWAREBUFFER_FORMAT_BLOB);
526         ASSERT_TRUE(desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER);
527 
528         // Get AHardwareBuffer properties
529         VkAndroidHardwareBufferPropertiesANDROID properties = {
530                 .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
531                 .pNext = nullptr,
532         };
533         ASSERT_EQ(vkGetAndroidHardwareBufferPropertiesANDROID(mDevice, output, &properties),
534                   VK_SUCCESS);
535 
536         // Create the output buffer with AHardwareBuffer memory
537         const VkExternalMemoryBufferCreateInfo externalMemoryBufferCreateInfo = {
538                 .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
539                 .pNext = nullptr,
540                 .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
541         };
542         const VkBufferCreateInfo bufferCreateInfo = {
543                 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
544                 .pNext = &externalMemoryBufferCreateInfo,
545                 .flags = 0u,
546                 .size = desc.width,
547                 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
548                 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
549                 .queueFamilyIndexCount = 0u,
550                 .pQueueFamilyIndices = nullptr,
551         };
552         ASSERT_EQ(vkCreateBuffer(mDevice, &bufferCreateInfo, nullptr, &mOutputBuffer), VK_SUCCESS);
553 
554         // Find a proper memory type
555         const auto maybeMemoryTypeIndex =
556                 findMemoryType(mPhysicalDeviceMemoryProperties, properties.memoryTypeBits,
557                                properties.allocationSize);
558         if (!maybeMemoryTypeIndex.has_value()) {
559             GTEST_SKIP() << "None of the memory type is suitable for allocation";
560         }
561 
562         // Import the AHardwareBuffer memory
563         const VkImportAndroidHardwareBufferInfoANDROID importMemoryAllocateInfo = {
564                 .sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID,
565                 .pNext = nullptr,
566                 .buffer = output,
567         };
568         const VkMemoryAllocateInfo memoryAllocInfo = {
569                 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
570                 .pNext = &importMemoryAllocateInfo,
571                 .allocationSize = properties.allocationSize,
572                 .memoryTypeIndex = maybeMemoryTypeIndex.value(),
573         };
574         const auto allocationResult =
575                 vkAllocateMemory(mDevice, &memoryAllocInfo, nullptr, &mOutputBufferMemory);
576         // Memory allocation may fail if the size exceeds the upper limit of a single allocation
577         // that the platform supports
578         if (allocationResult == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
579             GTEST_SKIP() << "Unable to allocate device memory of " << properties.allocationSize
580                          << " bytes";
581         }
582         ASSERT_EQ(allocationResult, VK_SUCCESS);
583 
584         // Bind the memory with the buffer
585         ASSERT_EQ(vkBindBufferMemory(mDevice, mOutputBuffer, mOutputBufferMemory, 0), VK_SUCCESS);
586 
587         // Update the descriptor sets
588         const VkDescriptorBufferInfo outputBufferDesc = {
589                 .buffer = mOutputBuffer,
590                 .offset = 0,
591                 .range = VK_WHOLE_SIZE,
592         };
593         const std::vector<VkWriteDescriptorSet> writeDst = {
594                 {
595                         .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
596                         .pNext = nullptr,
597                         .dstSet = mDescriptorSet,
598                         .dstBinding = 0,  // output buffer
599                         .dstArrayElement = 0,
600                         .descriptorCount = 1,
601                         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
602                         .pImageInfo = nullptr,
603                         .pBufferInfo = &outputBufferDesc,
604                         .pTexelBufferView = nullptr,
605                 },
606         };
607         vkUpdateDescriptorSets(mDevice, writeDst.size(), writeDst.data(), 0, nullptr);
608 
609         // Create shader module
610         const VkShaderModuleCreateInfo shaderDesc = {
611                 .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
612                 .flags = 0,
613                 .codeSize = kComputeShader.size() * sizeof(uint32_t),
614                 .pCode = kComputeShader.data(),
615         };
616         ASSERT_EQ(vkCreateShaderModule(mDevice, &shaderDesc, nullptr, &mShaderModule), VK_SUCCESS);
617 
618         // Create pipeline layout
619         const VkPipelineLayoutCreateInfo layoutDesc = {
620                 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
621                 .setLayoutCount = 1,
622                 .pSetLayouts = &mDescriptorSetLayout,
623                 .pushConstantRangeCount = 0,
624                 .pPushConstantRanges = nullptr,
625         };
626         ASSERT_EQ(vkCreatePipelineLayout(mDevice, &layoutDesc, nullptr, &mPipelineLayout),
627                   VK_SUCCESS);
628 
629         // Create compute pipeline
630         const uint32_t specializationData[] = {
631                 dispatchSize.workgroupSize,            // local_size_x
632                 dispatchSize.workgroupSize,            // local_size_y
633                 TestTypeHelper<dataType>::kClearData,  // CLEAR_DATA
634         };
635         const std::vector<VkSpecializationMapEntry> specializationMap = {
636                 // {constantID, offset, size}
637                 {0, 0 * sizeof(uint32_t), sizeof(uint32_t)},
638                 {1, 1 * sizeof(uint32_t), sizeof(uint32_t)},
639                 {2, 2 * sizeof(uint32_t), sizeof(uint32_t)},
640         };
641         const VkSpecializationInfo specializationInfo = {
642                 .mapEntryCount = static_cast<uint32_t>(specializationMap.size()),
643                 .pMapEntries = specializationMap.data(),
644                 .dataSize = sizeof(specializationData),
645                 .pData = specializationData,
646         };
647         const VkComputePipelineCreateInfo pipelineDesc = {
648                 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
649                 .stage =
650                         {
651                                 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
652                                 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
653                                 .module = mShaderModule,
654                                 .pName = "main",
655                                 .pSpecializationInfo = &specializationInfo,
656                         },
657                 .layout = mPipelineLayout,
658         };
659         ASSERT_EQ(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipelineDesc, nullptr,
660                                            &mPipeline),
661                   VK_SUCCESS);
662 
663         // Create command pool
664         const VkCommandPoolCreateInfo cmdpoolDesc = {
665                 .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
666                 .flags = 0u,
667                 .queueFamilyIndex = mQueueFamilyIndex,
668         };
669         ASSERT_EQ(vkCreateCommandPool(mDevice, &cmdpoolDesc, nullptr, &mCommandPool), VK_SUCCESS);
670 
671         // Create a command buffer
672         const VkCommandBufferAllocateInfo cmdBufferCreateInfo = {
673                 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
674                 .pNext = nullptr,
675                 .commandPool = mCommandPool,
676                 .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
677                 .commandBufferCount = 1,
678         };
679         ASSERT_EQ(vkAllocateCommandBuffers(mDevice, &cmdBufferCreateInfo, &mCommandBuffer),
680                   VK_SUCCESS);
681 
682         // Record command buffer
683         const VkCommandBufferBeginInfo commandBufferBeginInfo = {
684                 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
685                 .pNext = nullptr,
686                 .flags = 0,
687                 .pInheritanceInfo = nullptr,
688         };
689         ASSERT_EQ(vkBeginCommandBuffer(mCommandBuffer, &commandBufferBeginInfo), VK_SUCCESS);
690 
691         // Buffer barrier to acquire the ownership of the output buffer
692         addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
693                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
694                                    VK_ACCESS_SHADER_WRITE_BIT, VK_QUEUE_FAMILY_FOREIGN_EXT,
695                                    mQueueFamilyIndex);
696 
697         // Setup resources
698         vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
699         vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipelineLayout, 0,
700                                 1, &mDescriptorSet, 0, nullptr);
701 
702         // Dispatch compute
703         vkCmdDispatch(mCommandBuffer, dispatchSize.groupCountX, dispatchSize.groupCountY, 1);
704 
705         // Buffer barrier to release the ownership of the output buffer
706         addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer,
707                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
708                                    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
709                                    0, mQueueFamilyIndex, VK_QUEUE_FAMILY_FOREIGN_EXT);
710 
711         // Finish recording the command buffer
712         ASSERT_EQ(vkEndCommandBuffer(mCommandBuffer), VK_SUCCESS);
713 
714         // Create fence
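        // The fence is created with SYNC_FD export support so that runInternal() can later
        // convert it into an Android sync fence FD via vkGetFenceFdKHR().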
715         const VkExportFenceCreateInfo exportFenceCreateInfo = {
716                 .sType = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO,
717                 .pNext = nullptr,
718                 .handleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
719         };
720         const VkFenceCreateInfo fenceCreateInfo = {
721                 .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
722                 .pNext = &exportFenceCreateInfo,
723                 .flags = 0,
724         };
725         ASSERT_EQ(vkCreateFence(mDevice, &fenceCreateInfo, nullptr, &mFence), VK_SUCCESS);
726 
727         mIsValid = true;
728     }
729 
730     void runInternal(bool* outSuccess, base::unique_fd* outSyncFd) {
731         *outSuccess = false;
732 
733         // Submit to queue
734         const VkSubmitInfo submitInfo = {
735                 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
736                 .waitSemaphoreCount = 0,
737                 .pWaitSemaphores = nullptr,
738                 .pWaitDstStageMask = nullptr,
739                 .commandBufferCount = 1,
740                 .pCommandBuffers = &mCommandBuffer,
741                 .signalSemaphoreCount = 0,
742                 .pSignalSemaphores = nullptr,
743         };
744         ASSERT_EQ(vkResetFences(mDevice, 1, &mFence), VK_SUCCESS);
745         ASSERT_EQ(vkQueueSubmit(mQueue, 1, &submitInfo, mFence), VK_SUCCESS);
746 
747         // Export an Android sync fence FD
748         int syncFd = -1;
749         const VkFenceGetFdInfoKHR fenceGetFdInfo = {
750                 .sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR,
751                 .pNext = nullptr,
752                 .fence = mFence,
753                 .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
754         };
755         ASSERT_EQ(mPfnVkGetFenceFdKHR(mDevice, &fenceGetFdInfo, &syncFd), VK_SUCCESS);
756         *outSyncFd = base::unique_fd(syncFd);
757 
758         *outSuccess = true;
759     }
760 
761     // Instance
762     VkInstance mInstance = VK_NULL_HANDLE;
763 
764     // Physical device and queue family
765     VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE;
766     VkPhysicalDeviceProperties mPhysicalDeviceProperties{};
767     VkPhysicalDeviceMemoryProperties mPhysicalDeviceMemoryProperties{};
768     uint32_t mQueueFamilyIndex = 0;
769 
770     // Logical device and queue
771     VkDevice mDevice = VK_NULL_HANDLE;
772     VkQueue mQueue = VK_NULL_HANDLE;
773 
774     // Extension functions
775     PFN_vkGetFenceFdKHR mPfnVkGetFenceFdKHR = nullptr;
776 
777     // Resource descriptors
778     VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE;
779     VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE;
780     VkDescriptorSet mDescriptorSet = VK_NULL_HANDLE;
781 
782     // Output buffer
783     VkBuffer mOutputBuffer = VK_NULL_HANDLE;
784     VkDeviceMemory mOutputBufferMemory = VK_NULL_HANDLE;
785 
786     // Compute pipeline
787     VkShaderModule mShaderModule = VK_NULL_HANDLE;
788     VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE;
789     VkPipeline mPipeline = VK_NULL_HANDLE;
790 
791     // Command buffer
792     VkCommandPool mCommandPool = VK_NULL_HANDLE;
793     VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE;
794     VkFence mFence = VK_NULL_HANDLE;
795 
796     bool mIsValid = false;
797 };
798 
799 template <Type dataType>
800 class NnapiExecutor {
801    public:
802     // Returns the created object on success, or nullptr on failure.
803     static std::unique_ptr<NnapiExecutor> create(const ANeuralNetworksDevice* device,
804                                                  AHardwareBuffer* input, AHardwareBuffer* output) {
805         auto nnapi = std::make_unique<NnapiExecutor>(input, output);
806         nnapi->initialize(device);
807         return nnapi->mIsValid ? std::move(nnapi) : nullptr;
808     }
809 
810     // Prefer NnapiExecutor::create
811     NnapiExecutor(AHardwareBuffer* input, AHardwareBuffer* output)
812         : mInputMemory(input), mOutputMemory(output) {}
813 
814     // Returns {success, sync_fd}
815     std::pair<bool, base::unique_fd> run(const base::unique_fd& inSyncFd) {
816         bool success = false;
817         base::unique_fd outSyncFd;
818         runInternal(inSyncFd, &success, &outSyncFd);
819         return {success, std::move(outSyncFd)};
820     }
821 
822    private:
823     using ElementType = typename TestTypeHelper<dataType>::ElementType;
824 
825     void initialize(const ANeuralNetworksDevice* device) {
826         ASSERT_TRUE(mInputMemory.isValid());
827         ASSERT_TRUE(mOutputMemory.isValid());
828 
829         // Model input
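        // Quantized tensors use scale = 1.0 and zeroPoint = 0 so that the stored integer value
        // equals the real value it represents; float tensors must use a scale of 0 in NNAPI.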
830         const float scale = TestTypeHelper<dataType>::kIsQuantized ? 1.0f : 0.0f;
831         const OperandType tensorType(dataType, {kOperandSizeY, kOperandSizeX}, scale,
832                                      /*zeroPoint=*/0);
833         uint32_t inputTensor = mModel.addOperand(&tensorType);
834 
835         // Constant tensor
836         const OperandType constTensorType(dataType, {1}, scale, /*zeroPoint=*/0);
837         const ElementType constTensorData = static_cast<ElementType>(1);
838         uint32_t constTensor =
839                 mModel.addConstantOperand<ElementType>(&constTensorType, constTensorData);
840 
841         // Activation (NONE)
842         const OperandType activationType(Type::INT32, {});
843         uint32_t activationScalar = mModel.addConstantOperand<int32_t>(&activationType, 0);
844 
845         // Model output
846         uint32_t outputTensor = mModel.addOperand(&tensorType);
847 
848         // Model operation
849         mModel.addOperation(ANEURALNETWORKS_ADD, {inputTensor, constTensor, activationScalar},
850                             {outputTensor});
851 
852         // Finish model
853         mModel.identifyInputsAndOutputs({inputTensor}, {outputTensor});
854         mModel.relaxComputationFloat32toFloat16(/*isRelax=*/true);
855         ASSERT_TRUE(mModel.isValid());
856         ASSERT_EQ(mModel.finish(), Result::NO_ERROR);
857 
858         // Create compilation for the target device
859         Result result;
860         std::tie(result, mCompilation) =
861                 test_wrapper::Compilation::createForDevice(&mModel, device);
862         ASSERT_EQ(result, Result::NO_ERROR);
863 
864         // Finish the compilation
865         result = mCompilation.finish();
866         if (result != Result::NO_ERROR) {
867             GTEST_SKIP() << "Model is not supported by the device";
868         }
869 
870         mIsValid = true;
871     }
872 
873     void runInternal(const base::unique_fd& inSyncFd, bool* outSuccess,
874                      base::unique_fd* outSyncFd) {
875         *outSuccess = false;
876 
877         // Setup execution
878         mExecution = std::make_unique<test_wrapper::Execution>(&mCompilation);
879         ASSERT_EQ(mExecution->setInputFromMemory(/*index=*/0, &mInputMemory, /*offset=*/0,
880                                                  kOperandLength * sizeof(ElementType)),
881                   Result::NO_ERROR);
882         ASSERT_EQ(mExecution->setOutputFromMemory(/*index=*/0, &mOutputMemory, /*offset=*/0,
883                                                   kOperandLength * sizeof(ElementType)),
884                   Result::NO_ERROR);
885 
886         // Setup dependencies
887         std::vector<const test_wrapper::Event*> dependencies;
888         test_wrapper::Event start;
889         // The sync fence from Vulkan may not be valid if the GPU workload has already finished
890         // prior to exporting the fence.
891         if (inSyncFd.ok()) {
892             start = test_wrapper::Event(inSyncFd.get());
893             ASSERT_TRUE(start.isValid());
894             dependencies = {&start};
895         }
896 
897         // Fenced compute
898         test_wrapper::Event finished;
899         mExecution->startComputeWithDependencies(dependencies, /*infinite timeout*/ 0, &finished);
900 
901         // Get the output sync fence if supported; otherwise, wait until the execution is finished
902         int syncFd = -1;
903         finished.getSyncFenceFd(&syncFd);
904         if (syncFd == -1) {
905             ASSERT_EQ(finished.wait(), Result::NO_ERROR);
906         }
907         *outSyncFd = base::unique_fd(syncFd);
908         *outSuccess = true;
909     }
910 
911     test_wrapper::Model mModel;
912     test_wrapper::Compilation mCompilation;
913     std::unique_ptr<test_wrapper::Execution> mExecution;
914     test_wrapper::Memory mInputMemory, mOutputMemory;
915     bool mIsValid = false;
916 };
917 
918 class GpuNnapiTest : public testing::TestWithParam<NameAndDevice> {
919    protected:
920     void TearDown() override {
921         if (mGpuOutput) {
922             AHardwareBuffer_release(mGpuOutput);
923         }
924         if (mNnapiOutput) {
925             AHardwareBuffer_release(mNnapiOutput);
926         }
927     }
928 
929     template <Type dataType>
930     void runTest() {
931 #ifndef NNTEST_ONLY_PUBLIC_API
932         if (DeviceManager::get()->getUseCpuOnly()) {
933             GTEST_SKIP();
934         }
935 #endif
936 
937         // Allocate hardware buffers for GPU and NNAPI outputs
938         const size_t size = kOperandLength * sizeof(typename TestTypeHelper<dataType>::ElementType);
939         allocateBlobAhwb(
940                 size, AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
941                 &mGpuOutput);
942         allocateBlobAhwb(
943                 size, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
944                 &mNnapiOutput);
945         if (mGpuOutput == nullptr || mNnapiOutput == nullptr) return;
946 
947         // Create Vulkan compute pipeline
948         auto vulkan = VulkanComputePipeline<dataType>::create(mGpuOutput);
949         if (vulkan == nullptr) return;
950 
951         // Create NNAPI executor
952         auto nnapi = NnapiExecutor<dataType>::create(kDevice, mGpuOutput, mNnapiOutput);
953         if (nnapi == nullptr) return;
954 
955         // Run the test repeatedly for kNumberOfIterationsToTest iterations
956         for (uint32_t i = 0; i < kNumberOfIterationsToTest; i++) {
957             auto [gpuSuccess, gpuSyncFd] = vulkan->run();
958             ASSERT_TRUE(gpuSuccess);
959 
960             auto [nnapiSuccess, nnapiSyncFd] = nnapi->run(gpuSyncFd);
961             ASSERT_TRUE(nnapiSuccess);
962 
963             checkResults<dataType>(std::move(nnapiSyncFd));
964         }
965     }
966 
967     template <Type dataType>
968     void checkResults(base::unique_fd syncFd) {
969         using ElementType = typename TestTypeHelper<dataType>::ElementType;
970 
971         // Lock the buffer with the sync fence
972         // AHardwareBuffer_lock takes ownership of the sync fence and closes it even on errors
973         void* data;
974         ASSERT_EQ(AHardwareBuffer_lock(mNnapiOutput, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
975                                        syncFd.release(), /*rect=*/nullptr, &data),
976                   0);
977 
978         // Compare the actual results with the expected value
979         uint32_t numberOfErrors = 0;
980         const ElementType expected = static_cast<ElementType>(kExpectedResultInInt);
981         for (uint32_t i = 0; i < kOperandLength; i++) {
982             const ElementType actual = reinterpret_cast<ElementType*>(data)[i];
983 
984             // We expect bit-exact here because the arithmetic is trivial, and all intermediate
985             // and final results can be exactly represented by the primary data type.
986             if (actual != expected) {
987                 // Print at most kMaxNumberOfPrintedErrors errors by EXPECT_EQ
988                 if (numberOfErrors < kMaxNumberOfPrintedErrors) {
989                     EXPECT_EQ(actual, expected)
990                             << "When comparing element [" << i / kOperandSizeX << ", "
991                             << i % kOperandSizeX << "]";
992                 }
993                 numberOfErrors++;
994             }
995         }
996         EXPECT_EQ(numberOfErrors, 0u);
997         ASSERT_EQ(AHardwareBuffer_unlock(mNnapiOutput, /*fence=*/nullptr), 0);
998     }
999 
1000     // The NNAPI device under test
1001     const ANeuralNetworksDevice* kDevice = GetParam().second;
1002 
1003     AHardwareBuffer* mGpuOutput = nullptr;
1004     AHardwareBuffer* mNnapiOutput = nullptr;
1005 };
1006 
1007 TEST_P(GpuNnapiTest, Float32) {
1008     runTest<Type::TENSOR_FLOAT32>();
1009 }
1010 TEST_P(GpuNnapiTest, Float16) {
1011     runTest<Type::TENSOR_FLOAT16>();
1012 }
1013 TEST_P(GpuNnapiTest, Quant8Asymm) {
1014     runTest<Type::TENSOR_QUANT8_ASYMM>();
1015 }
1016 TEST_P(GpuNnapiTest, Quant8AsymmSigned) {
1017     runTest<Type::TENSOR_QUANT8_ASYMM_SIGNED>();
1018 }
1019 
1020 INSTANTIATE_TEST_SUITE_P(TestGpuNnapi, GpuNnapiTest, testing::ValuesIn(getNnapiDevices()),
1021                          printGpuNnapiTest);
1022 
1023 }  // namespace
1024 }  // namespace android::nn
1025