• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 #pragma once
6 
7 #include <CommonTestUtils.hpp>
8 
9 #include <armnn/Descriptors.hpp>
10 #include <armnn/INetwork.hpp>
11 #include <armnn/IRuntime.hpp>
12 
13 #include <Profiling.hpp>
14 #include <armnnUtils/QuantizeHelper.hpp>
15 #include <ResolveType.hpp>
16 
17 #include <doctest/doctest.h>
18 
19 #include <vector>
20 
21 namespace
22 {
23 
24 using namespace armnn;
25 
26 template<typename T>
ConstantUsageTest(const std::vector<BackendId> & computeDevice,const TensorInfo & commonTensorInfo,const std::vector<T> & inputData,const std::vector<T> & constantData,const std::vector<T> & expectedOutputData)27 bool ConstantUsageTest(const std::vector<BackendId>& computeDevice,
28                        const TensorInfo& commonTensorInfo,
29                        const std::vector<T>& inputData,
30                        const std::vector<T>& constantData,
31                        const std::vector<T>& expectedOutputData)
32 {
33     // Create runtime in which test will run
34     IRuntime::CreationOptions options;
35     IRuntimePtr runtime(IRuntime::Create(options));
36 
37     // Builds up the structure of the network.
38     INetworkPtr net(INetwork::Create());
39 
40     IConnectableLayer* input = net->AddInputLayer(0);
41     IConnectableLayer* constant = net->AddConstantLayer(ConstTensor(commonTensorInfo, constantData));
42     ARMNN_NO_DEPRECATE_WARN_BEGIN
43     IConnectableLayer* add = net->AddAdditionLayer();
44     ARMNN_NO_DEPRECATE_WARN_END
45     IConnectableLayer* output = net->AddOutputLayer(0);
46 
47     input->GetOutputSlot(0).Connect(add->GetInputSlot(0));
48     constant->GetOutputSlot(0).Connect(add->GetInputSlot(1));
49     add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
50 
51     // Sets the tensors in the network.
52     input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
53     constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
54     add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
55 
56     // optimize the network
57     IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec());
58 
59     // Loads it into the runtime.
60     NetworkId netId;
61     std::string errorMessage;
62     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage);
63     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
64 
65     // Creates structures for input & output.
66     std::vector<T> outputData(inputData.size());
67 
68     InputTensors inputTensors
69     {
70         {0, ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}
71     };
72     OutputTensors outputTensors
73     {
74         {0, Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
75     };
76 
77     // Does the inference.
78     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
79 
80     // Checks the results.
81     return outputData == expectedOutputData;
82 }
83 
ConstantUsageFloat32Test(const std::vector<BackendId> & backends)84 inline bool ConstantUsageFloat32Test(const std::vector<BackendId>& backends)
85 {
86     TensorInfo commonTensorInfo({ 2, 3 }, DataType::Float32);
87     commonTensorInfo.SetConstant(true);
88 
89     return ConstantUsageTest(backends,
90         commonTensorInfo,
91         std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input.
92         std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input.
93         std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }  // Expected output.
94     );
95 }
96 
ConstantUsageUint8Test(const std::vector<BackendId> & backends)97 inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends)
98 {
99     TensorInfo commonTensorInfo({ 2, 3 }, DataType::QAsymmU8);
100 
101     const float scale = 0.023529f;
102     const int8_t offset = -43;
103 
104     commonTensorInfo.SetQuantizationScale(scale);
105     commonTensorInfo.SetQuantizationOffset(offset);
106     commonTensorInfo.SetConstant(true);
107 
108     return ConstantUsageTest(backends,
109         commonTensorInfo,
110         armnnUtils::QuantizedVector<uint8_t>({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, scale, offset), // Input.
111         armnnUtils::QuantizedVector<uint8_t>({ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, scale, offset), // Const input.
112         armnnUtils::QuantizedVector<uint8_t>({ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }, scale, offset)  // Expected output.
113     );
114 }
115 
// Utility function to find the number of instances of a substring within a string.
//
// Occurrences are counted left to right without overlap: after a match the search resumes just
// past the matched text, so "aa" is found twice in "aaaa".
//
// string    - text to search; it is not modified.
// substring - text to look for.
//
// Returns the number of non-overlapping matches. An empty substring yields 0: std::string::find
// matches an empty needle at the current position without advancing, so searching for it would
// never terminate.
int SubStringCounter(std::string& string, std::string&& substring)
{
    if (substring.empty())
    {
        return 0;
    }

    int count = 0;
    // Resume each search just past the previous match to avoid finding the same substring twice.
    for (std::size_t found = string.find(substring);
         found != std::string::npos;
         found = string.find(substring, found + substring.length()))
    {
        ++count;
    }
    return count;
}
130 
131 template<DataType ArmnnIType, DataType ArmnnOType,
132          typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
EndToEndLayerTestImpl(INetworkPtr network,const std::map<int,std::vector<TInput>> & inputTensorData,const std::map<int,std::vector<TOutput>> & expectedOutputData,std::vector<BackendId> backends,float tolerance=0.000001f)133 void EndToEndLayerTestImpl(INetworkPtr network,
134                            const std::map<int, std::vector<TInput>>& inputTensorData,
135                            const std::map<int, std::vector<TOutput>>& expectedOutputData,
136                            std::vector<BackendId> backends,
137                            float tolerance = 0.000001f)
138 {
139     // Create runtime in which test will run
140     IRuntime::CreationOptions options;
141     IRuntimePtr runtime(IRuntime::Create(options));
142 
143     // optimize the network
144     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
145 
146     // Loads it into the runtime.
147     NetworkId netId;
148     std::string errorMessage;
149     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage);
150     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
151 
152     InputTensors inputTensors;
153     inputTensors.reserve(inputTensorData.size());
154     for (auto&& it : inputTensorData)
155     {
156         inputTensors.push_back({it.first,
157                                 ConstTensor(runtime->GetInputTensorInfo(netId, it.first), it.second.data())});
158     }
159     OutputTensors outputTensors;
160     outputTensors.reserve(expectedOutputData.size());
161     std::map<int, std::vector<TOutput>> outputStorage;
162     for (auto&& it : expectedOutputData)
163     {
164         std::vector<TOutput> out(it.second.size());
165         outputStorage.emplace(it.first, out);
166         outputTensors.push_back({it.first,
167                                  Tensor(runtime->GetOutputTensorInfo(netId, it.first),
168                                                outputStorage.at(it.first).data())});
169     }
170 
171     // Does the inference.
172     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
173 
174     // Checks the results.
175     for (auto&& it : expectedOutputData)
176     {
177         std::vector<TOutput> out = outputStorage.at(it.first);
178         for (unsigned int i = 0; i < out.size(); ++i)
179         {
180             CHECK_MESSAGE(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true,
181                     "Actual output: " << out[i] << ". Expected output:" << it.second[i]);
182 
183         }
184     }
185 }
186 
ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)187 inline void ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)
188 {
189     using namespace armnn;
190 
191     // Create runtime in which test will run
192     IRuntime::CreationOptions options;
193     IRuntimePtr runtime(armnn::IRuntime::Create(options));
194 
195     // build up the structure of the network
196     INetworkPtr net(INetwork::Create());
197 
198     IConnectableLayer* input = net->AddInputLayer(0);
199 
200     ActivationDescriptor descriptor;
201     descriptor.m_Function = ActivationFunction::Square;
202     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
203 
204     IConnectableLayer* output = net->AddOutputLayer(0);
205 
206     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
207     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
208 
209     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
210     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
211 
212     // Optimize the network
213     OptimizerOptionsOpaque optimizedOptions;
214     optimizedOptions.SetImportEnabled(true);
215     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
216     CHECK(optNet);
217 
218     // Loads it into the runtime.
219     NetworkId netId;
220     std::string errorMessage;
221     // Enable Importing
222     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
223     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
224     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
225 
226     // Creates structures for input & output
227     std::vector<float> inputData
228     {
229         1.0f, 2.0f, 3.0f, 4.0f
230     };
231 
232     // Misaligned input
233     float* misalignedInputData = reinterpret_cast<float*>(reinterpret_cast<char*>(inputData.data()) + 1);
234 
235     std::vector<float> outputData(4);
236 
237     // Aligned output
238     float* alignedOutputData = outputData.data();
239 
240     InputTensors inputTensors
241     {
242         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputData)},
243     };
244     OutputTensors outputTensors
245     {
246         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputData)}
247     };
248 
249     runtime->GetProfiler(netId)->EnableProfiling(true);
250 
251     // Do the inference and expect it to fail with a ImportMemoryException
252     CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
253 }
254 
ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)255 inline void ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)
256 {
257     using namespace armnn;
258 
259     // Create runtime in which test will run
260     IRuntime::CreationOptions options;
261     IRuntimePtr runtime(armnn::IRuntime::Create(options));
262 
263     // build up the structure of the network
264     INetworkPtr net(INetwork::Create());
265 
266     IConnectableLayer* input = net->AddInputLayer(0);
267 
268     ActivationDescriptor descriptor;
269     descriptor.m_Function = ActivationFunction::Square;
270     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
271 
272     IConnectableLayer* output = net->AddOutputLayer(0);
273 
274     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
275     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
276 
277     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
278     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
279 
280     // Optimize the network
281     OptimizerOptionsOpaque optimizedOptions;
282     optimizedOptions.SetImportEnabled(true);
283     optimizedOptions.SetExportEnabled(true);
284     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
285     CHECK(optNet);
286 
287     // Loads it into the runtime.
288     NetworkId netId;
289     std::string errorMessage;
290     // Enable Importing and Exporting
291     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
292     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
293     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
294 
295     // Creates structures for input & output
296     std::vector<float> inputData
297     {
298         1.0f, 2.0f, 3.0f, 4.0f, 5.0f
299     };
300 
301     // Aligned input
302     float* alignedInputData = inputData.data();
303 
304     std::vector<float> outputData(5);
305 
306     // Misaligned output
307     float* misalignedOutputData = reinterpret_cast<float*>(reinterpret_cast<char*>(outputData.data()) + 1);
308 
309     InputTensors inputTensors
310     {
311         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputData)},
312     };
313     OutputTensors outputTensors
314     {
315         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputData)}
316     };
317 
318     // Do the inference and expect it to fail with a ExportMemoryException
319     if (backends[0] == Compute::CpuAcc)
320     {
321         // For CpuAcc the NeonTensorHandle will throw its own exception on misaligned memory
322         CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
323     }
324     else
325     {
326         CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryExportException);
327     }
328 }
329 
ImportAlignedPointerTest(std::vector<BackendId> backends)330 inline void ImportAlignedPointerTest(std::vector<BackendId> backends)
331 {
332     using namespace armnn;
333 
334     // Create runtime in which test will run
335     IRuntime::CreationOptions options;
336     IRuntimePtr runtime(armnn::IRuntime::Create(options));
337 
338     // build up the structure of the network
339     INetworkPtr net(INetwork::Create());
340 
341     IConnectableLayer* input = net->AddInputLayer(0);
342 
343     ActivationDescriptor descriptor;
344     descriptor.m_Function = ActivationFunction::Square;
345     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
346 
347     IConnectableLayer* output = net->AddOutputLayer(0);
348 
349     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
350     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
351 
352     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
353     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
354 
355     // Optimize the network
356     OptimizerOptionsOpaque optimizedOptions;
357     optimizedOptions.SetImportEnabled(true);
358     optimizedOptions.SetExportEnabled(true);
359     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
360     CHECK(optNet);
361 
362     // Loads it into the runtime.
363     NetworkId netId;
364     std::string errorMessage;
365     // Enable Importing
366     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
367     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
368     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
369 
370     // Creates structures for input & output
371     std::vector<float> inputData
372     {
373         1.0f, 2.0f, 3.0f, 4.0f
374     };
375 
376     std::vector<float> outputData(4);
377 
378     std::vector<float> expectedOutput
379     {
380         1.0f, 4.0f, 9.0f, 16.0f
381     };
382 
383     InputTensors inputTensors
384     {
385         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
386     };
387     OutputTensors outputTensors
388     {
389         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
390     };
391 
392     runtime->GetProfiler(netId)->EnableProfiling(true);
393 
394     // Do the inference
395     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
396 
397     // Retrieve the Profiler.Print() output to get the workload execution
398     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
399     std::stringstream ss;
400     profilerManager.GetProfiler()->Print(ss);
401     std::string dump = ss.str();
402 
403     // Contains ActivationWorkload
404     std::size_t found = dump.find("ActivationWorkload");
405     CHECK(found != std::string::npos);
406 
407     // Contains SyncMemGeneric
408     found = dump.find("SyncMemGeneric");
409     CHECK(found != std::string::npos);
410 
411     // Does not contain CopyMemGeneric
412     found = dump.find("CopyMemGeneric");
413     CHECK(found == std::string::npos);
414 
415     // Check output is as expected
416     CHECK(outputData == expectedOutput);
417 }
418 
ImportOnlyWorkload(std::vector<BackendId> backends)419 inline void ImportOnlyWorkload(std::vector<BackendId> backends)
420 {
421     using namespace armnn;
422 
423     IRuntime::CreationOptions options;
424     IRuntimePtr runtime(IRuntime::Create(options));
425 
426     // Builds up the structure of the network.
427     INetworkPtr net(INetwork::Create());
428 
429     IConnectableLayer* input = net->AddInputLayer(0);
430 
431     ActivationDescriptor descriptor;
432     descriptor.m_Function = ActivationFunction::Square;
433     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
434 
435     IConnectableLayer* output = net->AddOutputLayer(0);
436 
437     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
438     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
439 
440     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
441     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
442 
443     // optimize the network
444     OptimizerOptionsOpaque optimizedOptions;
445     optimizedOptions.SetImportEnabled(true);
446     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
447 
448     INFO("Load Network");
449     // Load it into the runtime. It should pass.
450     NetworkId netId;
451     std::string errorMessage;
452     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
453     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
454     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
455 
456     INFO("Generate Data");
457     // Creates structures for input & output
458     std::vector<float> inputData
459     {
460         1.0f, 2.0f, 3.0f, 4.0f
461     };
462 
463     std::vector<float> outputData(4);
464 
465     std::vector<float> expectedOutput
466     {
467          1.0f, 4.0f, 9.0f, 16.0f
468     };
469 
470     INFO("Create Inference");
471 
472     InputTensors inputTensors
473     {
474         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
475     };
476     OutputTensors outputTensors
477     {
478         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
479     };
480 
481     INFO("Get Profiler");
482     runtime->GetProfiler(netId)->EnableProfiling(true);
483 
484     INFO("Run Inference");
485     // Do the inference
486     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
487 
488     INFO("Print Profiler");
489     // Retrieve the Profiler.Print() output to get the workload execution
490     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
491     std::stringstream ss;
492     profilerManager.GetProfiler()->Print(ss);
493     std::string dump = ss.str();
494 
495     // Check there are no SyncMemGeneric workloads as we didn't export
496     INFO("Find SyncMemGeneric");
497     int count = SubStringCounter(dump, "SyncMemGeneric");
498     CHECK(count == 0);
499 
500     // Should only be 1 CopyMemGeneric for the output as we imported
501     INFO("Find CopyMemGeneric");
502     count = SubStringCounter(dump, "CopyMemGeneric");
503     CHECK(count == 1);
504 
505     // Check the output is correct
506     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
507 }
508 
ExportOnlyWorkload(std::vector<BackendId> backends)509 inline void ExportOnlyWorkload(std::vector<BackendId> backends)
510 {
511     using namespace armnn;
512 
513     IRuntime::CreationOptions options;
514     IRuntimePtr runtime(IRuntime::Create(options));
515 
516     // Builds up the structure of the network.
517     INetworkPtr net(INetwork::Create());
518 
519     IConnectableLayer* input = net->AddInputLayer(0);
520 
521     ActivationDescriptor descriptor;
522     descriptor.m_Function = ActivationFunction::Square;
523     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
524 
525     IConnectableLayer* output = net->AddOutputLayer(0);
526 
527     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
528     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
529 
530     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
531     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
532 
533     // optimize the network
534     OptimizerOptionsOpaque optimizedOptions;
535     optimizedOptions.SetExportEnabled(true);
536     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
537 
538     INFO("Load Network");
539     // Load it into the runtime. It should pass.
540     NetworkId netId;
541     std::string errorMessage;
542     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Malloc);
543     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
544     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
545 
546     INFO("Generate Data");
547     // Creates structures for input & output
548     std::vector<float> inputData
549     {
550         1.0f, 2.0f, 3.0f, 4.0f
551     };
552 
553     std::vector<float> outputData(4);
554 
555     std::vector<float> expectedOutput
556     {
557          1.0f, 4.0f, 9.0f, 16.0f
558     };
559 
560     INFO("Create Inference");
561 
562     InputTensors inputTensors
563     {
564         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
565     };
566     OutputTensors outputTensors
567     {
568         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
569     };
570 
571     INFO("Get Profiler");
572     runtime->GetProfiler(netId)->EnableProfiling(true);
573 
574     INFO("Run Inference");
575     // Do the inference
576     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
577 
578     INFO("Print Profiler");
579     // Retrieve the Profiler.Print() output to get the workload execution
580     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
581     std::stringstream ss;
582     profilerManager.GetProfiler()->Print(ss);
583     std::string dump = ss.str();
584 
585     // Check there is a SyncMemGeneric workload as we exported
586     INFO("Find SyncMemGeneric");
587     int count = SubStringCounter(dump, "SyncMemGeneric");
588     CHECK(count == 1);
589 
590     // Should be 1 CopyMemGeneric for the output as we did not import
591     INFO("Find CopyMemGeneric");
592     count = SubStringCounter(dump, "CopyMemGeneric");
593     CHECK(count == 1);
594 
595     // Check the output is correct
596     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
597 }
598 
ImportAndExportWorkload(std::vector<BackendId> backends)599 inline void ImportAndExportWorkload(std::vector<BackendId> backends)
600 {
601     using namespace armnn;
602 
603     IRuntime::CreationOptions options;
604     IRuntimePtr runtime(IRuntime::Create(options));
605 
606     // Builds up the structure of the network.
607     INetworkPtr net(INetwork::Create());
608 
609     IConnectableLayer* input = net->AddInputLayer(0);
610 
611     ActivationDescriptor descriptor;
612     descriptor.m_Function = ActivationFunction::Square;
613     IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
614 
615     IConnectableLayer* output = net->AddOutputLayer(0);
616 
617     input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
618     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
619 
620     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
621     pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
622 
623     OptimizerOptionsOpaque optimizedOptions;
624     optimizedOptions.SetImportEnabled(true);
625     optimizedOptions.SetExportEnabled(true);
626     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
627 
628     INFO("Load Network");
629     // Load it into the runtime. It should pass.
630     NetworkId netId;
631     std::string errorMessage;
632     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
633     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
634     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
635 
636     INFO("Generate Data");
637     // Creates structures for input & output
638     std::vector<float> inputData
639     {
640         1.0f, 2.0f, 3.0f, 4.0f
641     };
642 
643     std::vector<float> outputData(4);
644 
645     std::vector<float> expectedOutput
646     {
647          1.0f, 4.0f, 9.0f, 16.0f
648     };
649 
650     INFO("Create inference");
651 
652     InputTensors inputTensors
653     {
654         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
655     };
656     OutputTensors outputTensors
657     {
658         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
659     };
660 
661     INFO("Get Profiler");
662     runtime->GetProfiler(netId)->EnableProfiling(true);
663 
664     INFO("Run Inference");
665     // Do the inference
666     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
667 
668     INFO("Print Profiler");
669     // Retrieve the Profiler.Print() output to get the workload execution
670     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
671     std::stringstream ss;
672     profilerManager.GetProfiler()->Print(ss);
673     std::string dump = ss.str();
674 
675     // Check there is a SyncMemGeneric workload as we exported
676     INFO("Find SyncMemGeneric");
677     int count = SubStringCounter(dump, "SyncMemGeneric");
678     CHECK(count == 1);
679 
680     // Shouldn't be any CopyMemGeneric workloads
681     INFO("Find CopyMemGeneric");
682     count = SubStringCounter(dump, "CopyMemGeneric");
683     CHECK(count == 0);
684 
685     // Check the output is correct
686     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
687 }
688 
ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)689 inline void ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)
690 {
691     using namespace armnn;
692 
693     // Create runtime in which test will run
694     IRuntime::CreationOptions options;
695     IRuntimePtr runtime(armnn::IRuntime::Create(options));
696 
697     // build up the structure of the network
698     INetworkPtr net(INetwork::Create());
699 
700     IConnectableLayer* input = net->AddInputLayer(0);
701 
702     ActivationDescriptor descriptor;
703     descriptor.m_Function = ActivationFunction::Square;
704     IConnectableLayer* activation = net->AddActivationLayer(descriptor);
705 
706     IConnectableLayer* output0 = net->AddOutputLayer(0);
707     IConnectableLayer* output1 = net->AddOutputLayer(1);
708 
709     input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
710     activation->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
711     activation->GetOutputSlot(0).Connect(output1->GetInputSlot(0));
712 
713     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32, 0.0f, 0, true));
714     activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32));
715 
716     // Optimize the network
717     OptimizerOptionsOpaque optimizedOptions;
718     optimizedOptions.SetImportEnabled(true);
719     optimizedOptions.SetExportEnabled(true);
720     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
721 
722     // Loads it into the runtime.
723     NetworkId netId;
724     std::string errorMessage;
725     // Enable Importing
726     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
727     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
728     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
729 
730     // Creates structures for input & output
731     std::vector<float> inputData
732     {
733         1.0f, 2.0f, 3.0f, 4.0f
734     };
735 
736     std::vector<float> outputData0(4);
737     std::vector<float> outputData1(4);
738 
739     std::vector<float> expectedOutput
740     {
741          1.0f, 4.0f, 9.0f, 16.0f
742     };
743 
744     InputTensors inputTensors
745     {
746         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
747     };
748     OutputTensors outputTensors
749     {
750         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData0.data())},
751         {1,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 1), outputData1.data())}
752     };
753 
754     // The result of the inference is not important, just the fact that there
755     // should not be CopyMemGeneric workloads.
756     runtime->GetProfiler(netId)->EnableProfiling(true);
757 
758     // Do the inference
759     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
760 
761     // Retrieve the Profiler.Print() output to get the workload execution
762     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
763     std::stringstream ss;
764     profilerManager.GetProfiler()->Print(ss);
765     std::string dump = ss.str();
766 
767     std::size_t found = std::string::npos;
768 
769     if (backends[0] == Compute::CpuRef)
770     {
771         found = dump.find("RefActivationWorkload");
772     }
773     else if (backends[0] == Compute::CpuAcc)
774     {
775         found = dump.find("NeonActivationWorkload");
776     }
777     else if (backends[0] == Compute::GpuAcc)
778     {
779         found = dump.find("ClActivationWorkload");
780     }
781 
782     CHECK(found != std::string::npos);
783     // No contains SyncMemGeneric
784     found = dump.find("SyncMemGeneric");
785     CHECK(found == std::string::npos);
786     // Contains CopyMemGeneric
787     found = dump.find("CopyMemGeneric");
788     CHECK(found != std::string::npos);
789 
790     // Check that the outputs are correct
791     CHECK(std::equal(outputData0.begin(), outputData0.end(),
792                                   expectedOutput.begin(), expectedOutput.end()));
793     CHECK(std::equal(outputData1.begin(), outputData1.end(),
794                                   expectedOutput.begin(), expectedOutput.end()));
795 }
796 
StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)797 inline void StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)
798 {
799     using namespace armnn;
800 
801     // Create runtime in which test will run
802     IRuntime::CreationOptions options;
803     IRuntimePtr runtime(armnn::IRuntime::Create(options));
804 
805     // build up the structure of the network
806     INetworkPtr net(INetwork::Create());
807 
808     IConnectableLayer* input = net->AddInputLayer(0);
809 
810     // Configure a strided slice with a stride the same size as the input but with a ShrinkAxisMask on the first
811     // dim of the output to make it too small to hold the specified slice.
812     StridedSliceDescriptor descriptor;
813     descriptor.m_Begin          = {0, 0};
814     descriptor.m_End            = {2, 3};
815     descriptor.m_Stride         = {1, 1};
816     descriptor.m_BeginMask      = 0;
817     descriptor.m_EndMask        = 0;
818     descriptor.m_ShrinkAxisMask = 1;
819     IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(descriptor);
820 
821     IConnectableLayer* output0 = net->AddOutputLayer(0);
822 
823     input->GetOutputSlot(0).Connect(stridedSlice->GetInputSlot(0));
824     stridedSlice->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
825 
826     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 2, 3 }, DataType::Float32, 0.0f, 0, true));
827     stridedSlice->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3 }, DataType::Float32));
828 
829     // Attempt to optimize the network and check that the correct exception is thrown
830     CHECK_THROWS_AS(Optimize(*net, backends, runtime->GetDeviceSpec()), armnn::LayerValidationException);
831 }
832 
ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)833 inline void ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)
834 {
835     /**
836      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
837      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
838      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
839      * In this case all inputs and outputs should be imported
840      */
841     using namespace armnn;
842     IRuntime::CreationOptions options;
843     IRuntimePtr runtime(IRuntime::Create(options));
844 
845     // Builds up the structure of the network.
846     INetworkPtr net(INetwork::Create());
847     IConnectableLayer* input = net->AddInputLayer(0);
848     ActivationDescriptor descriptor;
849     descriptor.m_Function = ActivationFunction::Square;
850     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
851     IConnectableLayer* output = net->AddOutputLayer(0);
852     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
853     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
854     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
855     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
856     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
857     INFO("Load Network");
858 
859     // Load it into the runtime. It should pass.
860     NetworkId netId;
861     std::string errorMessage;
862     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
863     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
864     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
865 
866     INFO("Generate Data");
867 
868     // Creates structures for input & output
869     std::vector<float> inputData
870     {
871         1.0f, 2.0f, 3.0f, 4.0f
872     };
873     std::vector<float> outputData(4);
874     std::vector<float> expectedOutput
875     {
876          1.0f, 4.0f, 9.0f, 16.0f
877     };
878 
879     // Check our input and output pointers are actually aligned
880     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
881     CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
882     CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
883 
884     INFO("Create Inference");
885     InputTensors inputTensors
886     {
887         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
888     };
889     OutputTensors outputTensors
890     {
891         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
892     };
893 
894     runtime->GetProfiler(netId)->EnableProfiling(true);
895     std::vector<ImportedInputId> importedInputIds =
896         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
897     CHECK(importedInputIds.size() == 1);
898     std::vector<ImportedOutputId> importedOutputIds =
899         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
900     CHECK(importedOutputIds.size() == 1);
901     // Do the inference and force the import as the memory is aligned.
902     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
903 
904     // Retrieve the Profiler.Print() output to get the workload execution
905     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
906     std::stringstream ss;
907     profilerManager.GetProfiler()->Print(ss);
908     std::string dump = ss.str();
909 
910     if (backends[0] == Compute::CpuAcc)
911     {
912         // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
913         // reconfigure is implemented
914         int count = SubStringCounter(dump, "SyncMemGeneric");
915         CHECK(count == 0);
916         // Should be 2 CopyMemGeneric workloads
917         count = SubStringCounter(dump, "CopyMemGeneric");
918         CHECK(count == 2);
919     }
920     else
921     {
922         // Check there is a SyncMemGeneric workload as we exported
923         int count = SubStringCounter(dump, "SyncMemGeneric");
924         CHECK(count == 1);
925         // Shouldn't be any CopyMemGeneric workloads
926         count = SubStringCounter(dump, "CopyMemGeneric");
927         CHECK(count == 0);
928     }
929     // Check the output is correct
930     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
931 }
932 
ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)933 inline void ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)
934 {
935     /**
936      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
937      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
938      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
939      * In this case all only the output should be imported
940      */
941     using namespace armnn;
942 
943     IRuntime::CreationOptions options;
944     IRuntimePtr runtime(IRuntime::Create(options));
945 
946     // Builds up the structure of the network.
947     INetworkPtr net(INetwork::Create());
948     IConnectableLayer* input = net->AddInputLayer(0);
949 
950     ActivationDescriptor descriptor;
951     descriptor.m_Function = ActivationFunction::Square;
952     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
953 
954     IConnectableLayer* output = net->AddOutputLayer(0);
955 
956     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
957     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
958     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
959     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
960 
961     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
962     INFO("Load Network");
963     // Load it into the runtime. It should pass.
964     NetworkId netId;
965     std::string errorMessage;
966     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
967     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
968     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
969 
970     INFO("Generate Data");
971 
972     // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
973     // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
974     auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
975 
976     float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
977 
978     // Check if our pointer is truly misaligned
979     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
980     CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
981 
982     std::vector<float> inputData
983     {
984          1.0f, 2.0f, 3.0f, 4.0f
985     };
986 
987     std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
988 
989     std::vector<float> outputData(4);
990     // Check our output buffer is aligned
991     CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
992 
993     std::vector<float> expectedOutput
994     {
995          1.0f, 4.0f, 9.0f, 16.0f
996     };
997 
998     INFO("Create Inference");
999     InputTensors inputTensors
1000     {
1001         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedMemPtr)},
1002     };
1003     OutputTensors outputTensors
1004     {
1005         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1006     };
1007     runtime->GetProfiler(netId)->EnableProfiling(true);
1008     std::vector<ImportedInputId> importedInputIds =
1009         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1010     // We expect the import to have failed.
1011     CHECK(importedInputIds.size() == 0);
1012     std::vector<ImportedOutputId> importedOutputIds =
1013         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1014     CHECK(importedOutputIds.size() == 1);
1015 
1016     // Do the inference and force the import as the memory is misaligned.
1017     runtime->EnqueueWorkload(netId, inputTensors, OutputTensors(), importedInputIds, importedOutputIds);
1018 
1019     // Retrieve the Profiler.Print() output to get the workload execution
1020     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1021     std::stringstream ss;
1022     profilerManager.GetProfiler()->Print(ss);
1023     std::string dump = ss.str();
1024 
1025     // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1026     // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1027     // for imports/copies. Only that the output is correct.
1028     if (backends[0] != Compute::GpuAcc)
1029     {
1030         if (backends[0] == Compute::CpuAcc)
1031         {
1032             // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1033             // reconfigure is implemented
1034             // We should get 0 SyncMemGeneric for the Output
1035             int count = SubStringCounter(dump, "SyncMemGeneric");
1036             CHECK(count == 0);
1037             // Should be 2 CopyMemGeneric as we copied the input
1038             count = SubStringCounter(dump, "CopyMemGeneric");
1039             CHECK(count == 2);
1040         }
1041         else
1042         {
1043             // We should get 1 SyncMemGeneric for the Output
1044             int count = SubStringCounter(dump, "SyncMemGeneric");
1045             CHECK(count == 1);
1046             // Should only be 1 CopyMemGeneric as we copied the input
1047             count = SubStringCounter(dump, "CopyMemGeneric");
1048             CHECK(count == 1);
1049         }
1050     }
1051     // Check the output is correct
1052     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1053     std::free(memPtr);
1054 }
1055 
ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)1056 inline void ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)
1057 {
1058     /**
1059      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1060      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
1061      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1062      * In this case all only the input should be imported
1063      */
1064     using namespace armnn;
1065 
1066     IRuntime::CreationOptions options;
1067     IRuntimePtr runtime(IRuntime::Create(options));
1068 
1069     // Builds up the structure of the network.
1070     INetworkPtr net(INetwork::Create());
1071     IConnectableLayer* input = net->AddInputLayer(0);
1072 
1073     ActivationDescriptor descriptor;
1074     descriptor.m_Function = ActivationFunction::Square;
1075     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1076 
1077     IConnectableLayer* output = net->AddOutputLayer(0);
1078 
1079     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1080     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1081     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1082     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1083 
1084     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1085     INFO("Load Network");
1086     // Load it into the runtime. It should pass.
1087     NetworkId netId;
1088     std::string errorMessage;
1089     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1090     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
1091     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
1092 
1093     INFO("Generate Data");
1094 
1095     // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1096     // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1097     auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1098 
1099     float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
1100 
1101     // Check if our pointer is truly misaligned
1102     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1103     CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
1104 
1105     // Creates structures for input & output
1106     std::vector<float> inputData
1107     {
1108         1.0f, 2.0f, 3.0f, 4.0f
1109     };
1110 
1111     // Check our input buffer is aligned
1112     CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1113     std::vector<float> expectedOutput
1114     {
1115          1.0f, 4.0f, 9.0f, 16.0f
1116     };
1117 
1118     INFO("Create Inference");
1119     InputTensors inputTensors
1120     {
1121         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1122     };
1123     OutputTensors outputTensors
1124     {
1125         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedMemPtr)}
1126     };
1127     runtime->GetProfiler(netId)->EnableProfiling(true);
1128     std::vector<ImportedInputId> importedInputIds =
1129         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1130     CHECK(importedInputIds.size() == 1);
1131     // We expect this to fail.
1132     std::vector<ImportedOutputId> importedOutputIds =
1133         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1134     CHECK(importedOutputIds.size() == 0);
1135 
1136     // Even if importing the output failed we still expect to be able to get it to work.
1137     runtime->EnqueueWorkload(netId, InputTensors(), outputTensors, importedInputIds, importedOutputIds);
1138 
1139     // Retrieve the Profiler.Print() output to get the workload execution
1140     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1141     std::stringstream ss;
1142     profilerManager.GetProfiler()->Print(ss);
1143     std::string dump = ss.str();
1144 
1145     // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1146     // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1147     // for imports/copies. Only that the output is correct.
1148     if (backends[0] != Compute::GpuAcc)
1149     {
1150         // Even though we Imported the Input we still shouldn't have a SyncMemGeneric
1151         int count = SubStringCounter(dump, "SyncMemGeneric");
1152         CHECK(count == 0);
1153         // Should only be 1 CopyMemGeneric as we copied the input
1154         count = SubStringCounter(dump, "CopyMemGeneric");
1155         if (backends[0] == Compute::CpuAcc)
1156         {
1157             // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1158             // reconfigure is implemented
1159             CHECK(count == 2);
1160         }
1161         else
1162         {
1163             CHECK(count == 1);
1164         }
1165         // Check the output is correct
1166     }
1167     unsigned int index = 0;
1168     std::vector<float> outputData(expectedOutput.size(), 0);
1169     std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
1170     for (auto outputValue : expectedOutput)
1171     {
1172         CHECK(outputValue == outputData[index]);
1173         ++index;
1174     }
1175     std::free(memPtr);
1176 }
1177 
ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)1178 inline void ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)
1179 {
1180     /**
1181      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1182      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
1183      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1184      * In this case all inputs and outputs should be copied
1185      */
1186     using namespace armnn;
1187 
1188     IRuntime::CreationOptions options;
1189     IRuntimePtr runtime(IRuntime::Create(options));
1190 
1191     // Builds up the structure of the network.
1192     INetworkPtr net(INetwork::Create());
1193     IConnectableLayer* input = net->AddInputLayer(0);
1194 
1195     ActivationDescriptor descriptor;
1196     descriptor.m_Function = ActivationFunction::Square;
1197     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1198 
1199     IConnectableLayer* output = net->AddOutputLayer(0);
1200 
1201     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1202     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1203     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1204     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1205 
1206     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1207     INFO("Load Network");
1208     // Load it into the runtime. It should pass.
1209     NetworkId netId;
1210     std::string errorMessage;
1211     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1212     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
1213     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
1214     INFO("Generate Data");
1215 
1216     // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1217     // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1218     auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1219     float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1220 
1221     // Check if our pointer is truly misaligned
1222     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1223     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1224     std::vector<float> inputData
1225     {
1226          1.0f, 2.0f, 3.0f, 4.0f
1227     };
1228     std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
1229 
1230     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1231     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1232 
1233     // Check if our pointer is truly misaligned
1234     CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1235 
1236     std::vector<float> expectedOutput
1237     {
1238          1.0f, 4.0f, 9.0f, 16.0f
1239     };
1240 
1241     INFO("Create Inference");
1242     InputTensors inputTensors
1243     {
1244         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1245     };
1246     OutputTensors outputTensors
1247     {
1248         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1249     };
1250     runtime->GetProfiler(netId)->EnableProfiling(true);
1251     std::vector<ImportedInputId> importedInputIds =
1252         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1253     // Import should have failed.
1254     CHECK(importedInputIds.size() == 0);
1255     std::vector<ImportedOutputId> importedOutputIds =
1256         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1257     // Import should have failed.
1258     CHECK(importedOutputIds.size() == 0);
1259 
1260     // Do the inference and force the import as the memory is misaligned.
1261     runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
1262 
1263     // Retrieve the Profiler.Print() output to get the workload execution
1264     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1265     std::stringstream ss;
1266     profilerManager.GetProfiler()->Print(ss);
1267     std::string dump = ss.str();
1268 
1269     // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1270     // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1271     // for imports/copies. Only that the output is correct.
1272     if (backends[0] != Compute::GpuAcc)
1273     {
1274         // We can only copy so there should be no SyncMemGeneric
1275         int count = SubStringCounter(dump, "SyncMemGeneric");
1276         CHECK(count == 0);
1277         // Should only be CopyMemGeneric workloads as we copied all buffers
1278         count = SubStringCounter(dump, "CopyMemGeneric");
1279         CHECK(count == 2);
1280     }
1281     // Check the output is correct
1282     unsigned int index = 0;
1283     std::vector<float> outputData(expectedOutput.size(), 0);
1284     std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
1285     for (auto expectedValue : expectedOutput)
1286     {
1287         CHECK(expectedValue == outputData[index]);
1288         ++index;
1289     }
1290     std::free(inputMemPtr);
1291     std::free(outputMemPtr);
1292 }
1293 
ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)1294 inline void ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)
1295 {
1296     /**
1297      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1298      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
1299      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1300      * In this we create some aligned buffers, import them into a network and validate the output and number of
1301      * SynMemGeneric/CopyMemgeneric. Then we try the same network again with misaligned buffers to make sure it falls
1302      * back to copying correctly.
1303      */
1304     using namespace armnn;
1305 
1306     IRuntime::CreationOptions options;
1307     IRuntimePtr runtime(IRuntime::Create(options));
1308 
1309     // Builds up the structure of the network.
1310     INetworkPtr net(INetwork::Create());
1311     IConnectableLayer* input = net->AddInputLayer(0);
1312 
1313     ActivationDescriptor descriptor;
1314     descriptor.m_Function = ActivationFunction::Square;
1315     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1316 
1317     IConnectableLayer* output = net->AddOutputLayer(0);
1318 
1319     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1320     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1321     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1322     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1323 
1324     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1325     INFO("Load Network");
1326     // Load it into the runtime. It should pass.
1327     NetworkId netId;
1328     std::string errorMessage;
1329     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1330     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
1331     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
1332     INFO("Generate Data");
1333 
1334     // Creates structures for input & output
1335     std::vector<float> inputData
1336     {
1337         1.0f, 2.0f, 3.0f, 4.0f
1338     };
1339     std::vector<float> outputData(4);
1340     std::vector<float> expectedOutput
1341     {
1342          1.0f, 4.0f, 9.0f, 16.0f
1343     };
1344 
1345     // Check our input and output pointers are actually aligned
1346     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1347     CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1348     CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
1349 
1350     INFO("Create Inference");
1351     InputTensors inputTensors
1352     {
1353         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1354     };
1355     OutputTensors outputTensors
1356     {
1357         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1358     };
1359 
1360     runtime->GetProfiler(netId)->EnableProfiling(true);
1361     std::vector<ImportedInputId> importedInputIds =
1362         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1363     CHECK(importedInputIds.size() == 1);
1364     std::vector<ImportedOutputId> importedOutputIds =
1365         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1366     CHECK(importedOutputIds.size() == 1);
1367     // Do the inference and force the import as the memory is aligned.
1368     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
1369 
1370     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1371     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1372     std::stringstream ss;
1373     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1374     std::string dump = ss.str();
1375 
1376     if (backends[0] == Compute::CpuAcc)
1377     {
1378         // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1379         // reconfigure is implemented
1380         int count = SubStringCounter(dump, "SyncMemGeneric");
1381         CHECK(count == 0);
1382         // Should be 2 CopyMemGeneric workloads
1383         count = SubStringCounter(dump, "CopyMemGeneric");
1384         CHECK(count >= 1);
1385     }
1386     else
1387     {
1388         // Check there is at least 1 SyncMemGeneric workload as we exported
1389         int count = SubStringCounter(dump, "SyncMemGeneric");
1390         CHECK(count >= 1);
1391         // Shouldn't be any CopyMemGeneric workloads
1392         count = SubStringCounter(dump, "CopyMemGeneric");
1393         CHECK(count == 0);
1394     }
1395     // Check the output is correct
1396     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1397 
1398     // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1399     // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1400     auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1401     float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1402 
1403     // Check if our pointer is truly misaligned
1404     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1405 
1406     std::vector<float> inputValues
1407     {
1408          2.0f, 3.0f, 4.0f, 5.0f
1409     };
1410 
1411     std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
1412 
1413     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1414     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1415 
1416     // Check if our pointer is truly misaligned
1417     CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1418 
1419     std::vector<float> expectedMisalignedOutput
1420     {
1421          4.0f, 9.0f, 16.0f, 25.0f
1422     };
1423 
1424     INFO("Create Second Inference");
1425     InputTensors inputTensorsMisaligned
1426     {
1427         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1428     };
1429     OutputTensors outputTensorsMisaligned
1430     {
1431         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1432     };
1433     importedInputIds = runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
1434     // Import should fail.
1435     CHECK(importedInputIds.size() == 0);
1436     importedOutputIds = runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
1437     // Import should fail.
1438     CHECK(importedOutputIds.size() == 0);
1439 
1440     // Do the inference and force the import as the memory is misaligned.
1441     runtime->EnqueueWorkload(netId,
1442                              inputTensorsMisaligned,
1443                              outputTensorsMisaligned,
1444                              importedInputIds,
1445                              importedOutputIds);
1446 
1447     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1448     // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
1449     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1450     dump = ss.str();
1451 
1452     // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1453     // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1454     // for imports/copies. Only that the output is correct.
1455     if (backends[0] != Compute::GpuAcc)
1456     {
1457         // The SyncMemGeneric will still be in the profiling log from the first inference
1458         int count = SubStringCounter(dump, "SyncMemGeneric");
1459         CHECK(count >= 1);
1460         // We should now see CopyMemGeneric workloads as we copied all buffers
1461         count = SubStringCounter(dump, "CopyMemGeneric");
1462         CHECK(count >= 1);
1463     }
1464     // Check the output is correct
1465     unsigned int index = 0;
1466     std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
1467     std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
1468     for (auto outputValue : expectedMisalignedOutput)
1469     {
1470         CHECK(outputValue == alignedOutputData[index]);
1471         ++index;
1472     }
1473     // Clean up to avoid interfering with other tests
1474     runtime->UnloadNetwork(netId);
1475     std::free(inputMemPtr);
1476     std::free(outputMemPtr);
1477 }
1478 
1479 
ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)1480 inline void ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)
1481 {
1482     /**
1483      * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1484      * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
1485      * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1486      * In this we create some misaligned buffers, copy them into a network and validate the output and number of
1487      * SynMemGeneric/CopyMemgeneric. Then we try the same network again with aligned buffers to make sure it switches
1488      * to importing correctly.
1489      */
1490     using namespace armnn;
1491 
1492     IRuntime::CreationOptions options;
1493     IRuntimePtr runtime(IRuntime::Create(options));
1494 
1495     // Builds up the structure of the network.
1496     INetworkPtr net(INetwork::Create());
1497     IConnectableLayer* input = net->AddInputLayer(0);
1498 
1499     ActivationDescriptor descriptor;
1500     descriptor.m_Function = ActivationFunction::Square;
1501     IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1502 
1503     IConnectableLayer* output = net->AddOutputLayer(0);
1504 
1505     input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1506     activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1507     input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1508     activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1509 
1510     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1511     INFO("Load Network");
1512     // Load it into the runtime. It should pass.
1513     NetworkId netId;
1514     std::string errorMessage;
1515     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1516     armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
1517     CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
1518     INFO("Generate Data");
1519 
1520     // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1521     // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1522     auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1523     float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1524 
1525     // Check if our pointer is truly misaligned
1526     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1527     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1528     std::vector<float> inputValues
1529     {
1530          2.0f, 3.0f, 4.0f, 5.0f
1531     };
1532     std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
1533 
1534     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1535     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1536 
1537     // Check if our pointer is truly misaligned
1538     CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1539 
1540     std::vector<float> expectedMisalignedOutput
1541     {
1542          4.0f, 9.0f, 16.0f, 25.0f
1543     };
1544 
1545     INFO("Create Second Inference");
1546     InputTensors inputTensorsMisaligned
1547     {
1548         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1549     };
1550     OutputTensors outputTensorsMisaligned
1551     {
1552         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1553     };
1554     runtime->GetProfiler(netId)->EnableProfiling(true);
1555     std::vector<ImportedInputId>  importedInputIds =
1556         runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
1557     // Import should fail.
1558     CHECK(importedInputIds.size() == 0);
1559     std::vector<ImportedOutputId> importedOutputIds =
1560         runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
1561     // Import should fail.
1562     CHECK(importedOutputIds.size() == 0);
1563 
1564     // Do the inference and force the import as the memory is misaligned.
1565     runtime->EnqueueWorkload(netId,
1566                              inputTensorsMisaligned,
1567                              outputTensorsMisaligned,
1568                              importedInputIds,
1569                              importedOutputIds);
1570 
1571     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1572     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1573     std::stringstream ss;
1574     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1575     std::string dump = ss.str();
1576 
1577     // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1578     // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1579     // for imports/copies. Only that the output is correct.
1580     if (backends[0] != Compute::GpuAcc)
1581     {
1582         // We can only copy so there should be no SyncMemGeneric
1583         int count = SubStringCounter(dump, "SyncMemGeneric");
1584         CHECK(count == 0);
1585         // Should only be CopyMemGeneric workloads as we copied all buffers
1586         count = SubStringCounter(dump, "CopyMemGeneric");
1587         CHECK(count >= 1);
1588     }
1589     // Check the output is correct
1590     unsigned int index = 0;
1591     std::vector<float> alignedOutput(expectedMisalignedOutput.size());
1592     std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
1593     for (auto outputValue : expectedMisalignedOutput)
1594     {
1595         CHECK(outputValue == alignedOutput[index]);
1596         ++index;
1597     }
1598     std::free(inputMemPtr);
1599     std::free(outputMemPtr);
1600 
1601     // Creates structures for input & output
1602     std::vector<float> inputData
1603     {
1604         1.0f, 2.0f, 3.0f, 4.0f
1605     };
1606     std::vector<float> outputData(4);
1607     std::vector<float> expectedOutput
1608     {
1609          1.0f, 4.0f, 9.0f, 16.0f
1610     };
1611 
1612     // Check our input and output pointers are actually aligned
1613     CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1614     CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
1615 
1616     INFO("Create Inference");
1617     InputTensors inputTensors
1618     {
1619         {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1620     };
1621     OutputTensors outputTensors
1622     {
1623         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1624     };
1625 
1626     importedInputIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1627     CHECK(importedInputIds.size() == 1);
1628     importedOutputIds = runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1629     CHECK(importedOutputIds.size() == 1);
1630     // Do the inference and force the import as the memory is aligned.
1631     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
1632 
1633     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1634     // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
1635     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1636     dump = ss.str();
1637 
1638     if (backends[0] == Compute::CpuAcc)
1639     {
1640         // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1641         // reconfigure is implemented
1642         int count = SubStringCounter(dump, "SyncMemGeneric");
1643         CHECK(count == 0);
1644         // Should be 2 CopyMemGeneric workloads
1645         count = SubStringCounter(dump, "CopyMemGeneric");
1646         CHECK(count >= 1);
1647     }
1648     else
1649     {
1650         // Repeated inferences make it difficult to check for an accurate count. So we just validate that we have a
1651         // SyncMemGeneric Workload when we previously didn't
1652         int count = SubStringCounter(dump, "SyncMemGeneric");
1653         CHECK(count >= 1);
1654         // Should still be some CopyMemGeneric Workloads from the last inference
1655         count = SubStringCounter(dump, "CopyMemGeneric");
1656         CHECK(count >= 1);
1657     }
1658     // Check the output is correct
1659     CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1660     // Clean up to avoid interfering with other tests
1661     runtime->UnloadNetwork(netId);
1662 }
1663 
1664 } // anonymous namespace
1665