//
// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once

#include <CommonTestUtils.hpp>

#include <armnn/Descriptors.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>

#include <Profiling.hpp>
#include <armnnUtils/QuantizeHelper.hpp>
#include <ResolveType.hpp>

#include <doctest/doctest.h>

#include <vector>

namespace
{

using namespace armnn;
template<typename T>
bool ConstantUsageTest(const std::vector<BackendId>& computeDevice,
                       const TensorInfo& commonTensorInfo,
                       const std::vector<T>& inputData,
                       const std::vector<T>& constantData,
                       const std::vector<T>& expectedOutputData)
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);
    IConnectableLayer* constant = net->AddConstantLayer(ConstTensor(commonTensorInfo, constantData));
    ARMNN_NO_DEPRECATE_WARN_BEGIN
    IConnectableLayer* add = net->AddAdditionLayer();
    ARMNN_NO_DEPRECATE_WARN_END
    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    constant->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    // Sets the tensors in the network.
    input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
    constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
    add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);

    // optimize the network
    IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec());

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    // Creates structures for input & output.
    std::vector<T> outputData(inputData.size());

    InputTensors inputTensors
    {
        {0, ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}
    };
    OutputTensors outputTensors
    {
        {0, Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    // Does the inference.
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Checks the results.
    return outputData == expectedOutputData;
}

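// Float32 variant: adds { 1..6 } to the reversed constant { 6..1 } and expects a tensor of 7s.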
inline bool ConstantUsageFloat32Test(const std::vector<BackendId>& backends)
{
    TensorInfo commonTensorInfo({ 2, 3 }, DataType::Float32);
    commonTensorInfo.SetConstant(true);

    return ConstantUsageTest(backends,
                             commonTensorInfo,
                             std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input.
                             std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input.
                             std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }  // Expected output.
    );
}

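// QAsymmU8 variant: all tensors share the same quantization scale and offset, so the
// quantized addition produces the same values as the Float32 variant.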
inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends)
{
    TensorInfo commonTensorInfo({ 2, 3 }, DataType::QAsymmU8);

    const float scale = 0.023529f;
    const int8_t offset = -43;

    commonTensorInfo.SetQuantizationScale(scale);
    commonTensorInfo.SetQuantizationOffset(offset);
    commonTensorInfo.SetConstant(true);

    return ConstantUsageTest(backends,
                             commonTensorInfo,
                             armnnUtils::QuantizedVector<uint8_t>({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, scale, offset), // Input.
                             armnnUtils::QuantizedVector<uint8_t>({ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, scale, offset), // Const input.
                             armnnUtils::QuantizedVector<uint8_t>({ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }, scale, offset)  // Expected output.
    );
}

// Utility function to find the number of instances of a substring within a string.
int SubStringCounter(std::string& string, std::string&& substring)
{
    std::size_t found = 0;
    int count = 0;
    // Look for the substring starting from where we last found the substring
    while ((found = string.find(substring, found)) != std::string::npos)
    {
        count++;
        // Offset by substring length to avoid finding the same substring twice
        found += substring.length();
    }
    return count;
}

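// Generic end-to-end helper: optimizes and loads the given network, binds the supplied
// input/output buffers by binding id, runs one inference and compares every output
// element against expectedOutputData within the given tolerance.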
template<DataType ArmnnIType, DataType ArmnnOType,
         typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
void EndToEndLayerTestImpl(INetworkPtr network,
                           const std::map<int, std::vector<TInput>>& inputTensorData,
                           const std::map<int, std::vector<TOutput>>& expectedOutputData,
                           std::vector<BackendId> backends,
                           float tolerance = 0.000001f)
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // optimize the network
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    InputTensors inputTensors;
    inputTensors.reserve(inputTensorData.size());
    for (auto&& it : inputTensorData)
    {
        inputTensors.push_back({it.first,
                                ConstTensor(runtime->GetInputTensorInfo(netId, it.first), it.second.data())});
    }
    OutputTensors outputTensors;
    outputTensors.reserve(expectedOutputData.size());
    std::map<int, std::vector<TOutput>> outputStorage;
    for (auto&& it : expectedOutputData)
    {
        std::vector<TOutput> out(it.second.size());
        outputStorage.emplace(it.first, out);
        outputTensors.push_back({it.first,
                                 Tensor(runtime->GetOutputTensorInfo(netId, it.first),
                                        outputStorage.at(it.first).data())});
    }

    // Does the inference.
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Checks the results.
    for (auto&& it : expectedOutputData)
    {
        std::vector<TOutput> out = outputStorage.at(it.first);
        for (unsigned int i = 0; i < out.size(); ++i)
        {
            CHECK_MESSAGE(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true,
                          "Actual output: " << out[i] << ". Expected output: " << it.second[i]);
        }
    }
}

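// With import enabled, a misaligned input pointer cannot be imported: EnqueueWorkload
// is expected to throw MemoryImportException.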
inline void ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)
{
    using namespace armnn;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // Optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    // Enable Importing
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    // Misaligned input
    float* misalignedInputData = reinterpret_cast<float*>(reinterpret_cast<char*>(inputData.data()) + 1);

    std::vector<float> outputData(4);

    // Aligned output
    float* alignedOutputData = outputData.data();

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputData)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputData)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference and expect it to fail with a MemoryImportException
    CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
}

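// With export enabled, a misaligned output pointer cannot be exported: CpuAcc throws
// MemoryImportException from its own tensor handle, other backends MemoryExportException.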
inline void ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)
{
    using namespace armnn;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // Optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    optimizedOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    // Enable Importing and Exporting
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f
    };

    // Aligned input
    float* alignedInputData = inputData.data();

    std::vector<float> outputData(5);

    // Misaligned output
    float* misalignedOutputData = reinterpret_cast<float*>(reinterpret_cast<char*>(outputData.data()) + 1);

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputData)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputData)}
    };

    // Do the inference and expect it to fail with a MemoryExportException
    if (backends[0] == Compute::CpuAcc)
    {
        // For CpuAcc the NeonTensorHandle will throw its own exception on misaligned memory
        CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
    }
    else
    {
        CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryExportException);
    }
}

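// With aligned buffers and import/export enabled, the profiler dump should show the
// zero-copy SyncMemGeneric workload and no CopyMemGeneric workloads.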
inline void ImportAlignedPointerTest(std::vector<BackendId> backends)
{
    using namespace armnn;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // Optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    optimizedOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    // Enable Importing and Exporting
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ActivationWorkload
    std::size_t found = dump.find("ActivationWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

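// Import-only scenario: no SyncMemGeneric is expected (nothing is exported) and exactly
// one CopyMemGeneric should remain, for the output.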
inline void ImportOnlyWorkload(std::vector<BackendId> backends)
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);

    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");
    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create Inference");

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    INFO("Get Profiler");
    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run Inference");
    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    INFO("Print Profiler");
    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Check there are no SyncMemGeneric workloads as we didn't export
    INFO("Find SyncMemGeneric");
    int count = SubStringCounter(dump, "SyncMemGeneric");
    CHECK(count == 0);

    // Should only be 1 CopyMemGeneric, for the output, as the input was imported
    INFO("Find CopyMemGeneric");
    count = SubStringCounter(dump, "CopyMemGeneric");
    CHECK(count == 1);

    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}

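// Export-only scenario: one SyncMemGeneric for the exported output and one
// CopyMemGeneric for the input, which is not imported.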
inline void ExportOnlyWorkload(std::vector<BackendId> backends)
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);

    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Malloc);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");
    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create Inference");

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    INFO("Get Profiler");
    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run Inference");
    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    INFO("Print Profiler");
    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Check there is a SyncMemGeneric workload as we exported
    INFO("Find SyncMemGeneric");
    int count = SubStringCounter(dump, "SyncMemGeneric");
    CHECK(count == 1);

    // Should be 1 CopyMemGeneric for the input as we did not import it
    INFO("Find CopyMemGeneric");
    count = SubStringCounter(dump, "CopyMemGeneric");
    CHECK(count == 1);

    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}

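// Import-and-export scenario: one SyncMemGeneric for the exported output and no
// CopyMemGeneric workloads at all.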
inline void ImportAndExportWorkload(std::vector<BackendId> backends)
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    // optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    optimizedOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);

    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");
    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create inference");

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    INFO("Get Profiler");
    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run Inference");
    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    INFO("Print Profiler");
    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Check there is a SyncMemGeneric workload as we exported
    INFO("Find SyncMemGeneric");
    int count = SubStringCounter(dump, "SyncMemGeneric");
    CHECK(count == 1);

    // Shouldn't be any CopyMemGeneric workloads
    INFO("Find CopyMemGeneric");
    count = SubStringCounter(dump, "CopyMemGeneric");
    CHECK(count == 0);

    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}

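// An output slot connected to several output layers cannot be exported, so copies
// (CopyMemGeneric) are expected instead of a zero-copy sync (SyncMemGeneric).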
inline void ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)
{
    using namespace armnn;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor);

    IConnectableLayer* output0 = net->AddOutputLayer(0);
    IConnectableLayer* output1 = net->AddOutputLayer(1);

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output1->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32, 0.0f, 0, true));
    activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32));

    // Optimize the network
    OptimizerOptionsOpaque optimizedOptions;
    optimizedOptions.SetImportEnabled(true);
    optimizedOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);

    // Loads it into the runtime.
    NetworkId netId;
    std::string errorMessage;
    // Enable Importing and Exporting
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::vector<float> outputData0(4);
    std::vector<float> outputData1(4);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData0.data())},
        {1, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 1), outputData1.data())}
    };

    // The result of the inference is not important, just the fact that an output slot with
    // several connections cannot be exported, so CopyMemGeneric workloads should be present.
    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    std::size_t found = std::string::npos;

    if (backends[0] == Compute::CpuRef)
    {
        found = dump.find("RefActivationWorkload");
    }
    else if (backends[0] == Compute::CpuAcc)
    {
        found = dump.find("NeonActivationWorkload");
    }
    else if (backends[0] == Compute::GpuAcc)
    {
        found = dump.find("ClActivationWorkload");
    }

    CHECK(found != std::string::npos);
    // Does not contain SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found == std::string::npos);
    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check that the outputs are correct
    CHECK(std::equal(outputData0.begin(), outputData0.end(),
                     expectedOutput.begin(), expectedOutput.end()));
    CHECK(std::equal(outputData1.begin(), outputData1.end(),
                     expectedOutput.begin(), expectedOutput.end()));
}

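// Builds a StridedSlice whose ShrinkAxisMask makes the declared output tensor too small
// for the requested slice; Optimize() is expected to throw LayerValidationException.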
inline void StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)
{
    using namespace armnn;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0);

    // Configure a strided slice with a stride the same size as the input but with a ShrinkAxisMask on the first
    // dim of the output to make it too small to hold the specified slice.
    StridedSliceDescriptor descriptor;
    descriptor.m_Begin = {0, 0};
    descriptor.m_End = {2, 3};
    descriptor.m_Stride = {1, 1};
    descriptor.m_BeginMask = 0;
    descriptor.m_EndMask = 0;
    descriptor.m_ShrinkAxisMask = 1;
    IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(descriptor);

    IConnectableLayer* output0 = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(stridedSlice->GetInputSlot(0));
    stridedSlice->GetOutputSlot(0).Connect(output0->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 2, 3 }, DataType::Float32, 0.0f, 0, true));
    stridedSlice->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3 }, DataType::Float32));

    // Attempt to optimize the network and check that the correct exception is thrown
    CHECK_THROWS_AS(Optimize(*net, backends, runtime->GetDeviceSpec()), armnn::LayerValidationException);
}

inline void ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)
{
    /**
     * This test is similar to the Import tests above: we create a network with a square function, pass in a vector
     * of 4 floats, square them and validate the output. We then check the profiling logs to see whether input/output
     * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
     * In this case all inputs and outputs should be imported.
     */
    using namespace armnn;
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());
    IConnectableLayer* input = net->AddInputLayer(0);
    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
    IConnectableLayer* output = net->AddOutputLayer(0);
    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
    INFO("Load Network");

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };
    std::vector<float> outputData(4);
    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    // Check our input and output pointers are actually aligned
    uintptr_t alignment = GetDataTypeSize(DataType::Float32);
    CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
    CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));

    INFO("Create Inference");
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    CHECK(importedInputIds.size() == 1);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
    CHECK(importedOutputIds.size() == 1);
    // Do the inference and force the import as the memory is aligned.
    runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    if (backends[0] == Compute::CpuAcc)
    {
        // Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
        // reconfigure is implemented
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 0);
        // Should be 2 CopyMemGeneric workloads
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count == 2);
    }
    else
    {
        // Check there is a SyncMemGeneric workload as we exported
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 1);
        // Shouldn't be any CopyMemGeneric workloads
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count == 0);
    }
    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}

inline void ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)
{
    /**
     * This test is similar to the Import tests above: we create a network with a square function, pass in a vector
     * of 4 floats, square them and validate the output. We then check the profiling logs to see whether input/output
     * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
     * In this case only the output should be imported.
     */
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());
    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");

    // This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
    // this guarantees that the resultant buffer is misaligned and should therefore always be copied.
    auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));

    float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);

    // Check if our pointer is truly misaligned
    uintptr_t alignment = GetDataTypeSize(DataType::Float32);
    CHECK(reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);

    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    std::memcpy(misalignedMemPtr, inputData.data(), 4 * sizeof(float));

    std::vector<float> outputData(4);
    // Check our output buffer is aligned
    CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create Inference");
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedMemPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
    };
    runtime->GetProfiler(netId)->EnableProfiling(true);
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    // We expect the import to have failed.
    CHECK(importedInputIds.size() == 0);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
    CHECK(importedOutputIds.size() == 1);

    // Do the inference; the misaligned input could not be imported, so it is passed in to be copied.
    runtime->EnqueueWorkload(netId, inputTensors, OutputTensors(), importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
    // new set of addresses within GPU memory. This will almost always be auto-aligned, so we don't need to check
    // for imports/copies, only that the output is correct.
    if (backends[0] != Compute::GpuAcc)
    {
        if (backends[0] == Compute::CpuAcc)
        {
            // Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
            // reconfigure is implemented
            // We should get 0 SyncMemGeneric for the Output
            int count = SubStringCounter(dump, "SyncMemGeneric");
            CHECK(count == 0);
            // Should be 2 CopyMemGeneric as we copied the input
            count = SubStringCounter(dump, "CopyMemGeneric");
            CHECK(count == 2);
        }
        else
        {
            // We should get 1 SyncMemGeneric for the Output
            int count = SubStringCounter(dump, "SyncMemGeneric");
            CHECK(count == 1);
            // Should only be 1 CopyMemGeneric as we copied the input
            count = SubStringCounter(dump, "CopyMemGeneric");
            CHECK(count == 1);
        }
    }
    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
    std::free(memPtr);
}

inline void ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
    /**
     * This test is similar to the Import tests above: we create a network with a square function, pass in a vector
     * of 4 floats, square them and validate the output. We then check the profiling logs to see whether input/output
     * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
     * In this case only the input should be imported.
     */
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());
    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);

    INFO("Generate Data");

    // This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
    // this guarantees that the resultant buffer is misaligned and should therefore always be copied.
    auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));

    float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);

    // Check if our pointer is truly misaligned
    uintptr_t alignment = GetDataTypeSize(DataType::Float32);
    CHECK(reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);

    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    // Check our input buffer is aligned
    CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create Inference");
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedMemPtr)}
    };
    runtime->GetProfiler(netId)->EnableProfiling(true);
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    CHECK(importedInputIds.size() == 1);
    // We expect this to fail.
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
    CHECK(importedOutputIds.size() == 0);

    // Even though importing the output failed, the inference should still run, falling back to a copy.
    runtime->EnqueueWorkload(netId, InputTensors(), outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
    // new set of addresses within GPU memory. This will almost always be auto-aligned, so we don't need to check
    // for imports/copies, only that the output is correct.
    if (backends[0] != Compute::GpuAcc)
    {
        // Even though we imported the input we still shouldn't have a SyncMemGeneric
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 0);
        // Should only be 1 CopyMemGeneric, for the output that we copied
        count = SubStringCounter(dump, "CopyMemGeneric");
        if (backends[0] == Compute::CpuAcc)
        {
            // Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
            // reconfigure is implemented
            CHECK(count == 2);
        }
        else
        {
            CHECK(count == 1);
        }
    }
    // Check the output is correct
    unsigned int index = 0;
    std::vector<float> outputData(expectedOutput.size(), 0);
    std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
    for (auto outputValue : expectedOutput)
    {
        CHECK(outputValue == outputData[index]);
        ++index;
    }
    std::free(memPtr);
}

inline void ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
    /**
     * This test is similar to the Import tests above: we create a network with a square function, pass in a vector
     * of 4 floats, square them and validate the output. We then check the profiling logs to see whether input/output
     * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
     * In this case all inputs and outputs should be copied.
     */
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());
    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
    INFO("Generate Data");

    // This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
    // this guarantees that the resultant buffer is misaligned and should therefore always be copied.
    auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
    float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    uintptr_t alignment = GetDataTypeSize(DataType::Float32);
    CHECK(reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };
    std::memcpy(misalignedInputPtr, inputData.data(), 4 * sizeof(float));

    auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
    float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    CHECK(reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);

    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    INFO("Create Inference");
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
    };
    runtime->GetProfiler(netId)->EnableProfiling(true);
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    // Import should have failed.
    CHECK(importedInputIds.size() == 0);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
    // Import should have failed.
    CHECK(importedOutputIds.size() == 0);

    // Do the inference; both imports failed, so the misaligned buffers will be copied.
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
    // new set of addresses within GPU memory. This will almost always be auto-aligned, so we don't need to check
    // for imports/copies, only that the output is correct.
    if (backends[0] != Compute::GpuAcc)
    {
        // We can only copy so there should be no SyncMemGeneric
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 0);
        // Should only be CopyMemGeneric workloads as we copied all buffers
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count == 2);
    }
    // Check the output is correct
    unsigned int index = 0;
    std::vector<float> outputData(expectedOutput.size(), 0);
    std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
    for (auto expectedValue : expectedOutput)
    {
        CHECK(expectedValue == outputData[index]);
        ++index;
    }
    std::free(inputMemPtr);
    std::free(outputMemPtr);
}

ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)1294 inline void ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)
1295 {
1296 /**
1297 * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1298 * with 4 floats, square them. and validate the output. We then check the profiling logs to see if input/output
1299 * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1300 * In this we create some aligned buffers, import them into a network and validate the output and number of
1301 * SynMemGeneric/CopyMemgeneric. Then we try the same network again with misaligned buffers to make sure it falls
1302 * back to copying correctly.
1303 */
1304 using namespace armnn;
1305
1306 IRuntime::CreationOptions options;
1307 IRuntimePtr runtime(IRuntime::Create(options));
1308
1309 // Builds up the structure of the network.
1310 INetworkPtr net(INetwork::Create());
1311 IConnectableLayer* input = net->AddInputLayer(0);
1312
1313 ActivationDescriptor descriptor;
1314 descriptor.m_Function = ActivationFunction::Square;
1315 IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1316
1317 IConnectableLayer* output = net->AddOutputLayer(0);
1318
1319 input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1320 activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1321 input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1322 activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1323
1324 IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1325 INFO("Load Network");
1326 // Load it into the runtime. It should pass.
1327 NetworkId netId;
1328 std::string errorMessage;
1329 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1330 armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
1331 CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
1332 INFO("Generate Data");
1333
1334 // Creates structures for input & output
1335 std::vector<float> inputData
1336 {
1337 1.0f, 2.0f, 3.0f, 4.0f
1338 };
1339 std::vector<float> outputData(4);
1340 std::vector<float> expectedOutput
1341 {
1342 1.0f, 4.0f, 9.0f, 16.0f
1343 };
1344
1345 // Check our input and output pointers are actually aligned
1346 uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1347 CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1348 CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
1349
1350 INFO("Create Inference");
1351 InputTensors inputTensors
1352 {
1353 {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1354 };
1355 OutputTensors outputTensors
1356 {
1357 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1358 };
1359
1360 runtime->GetProfiler(netId)->EnableProfiling(true);
1361 std::vector<ImportedInputId> importedInputIds =
1362 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1363 CHECK(importedInputIds.size() == 1);
1364 std::vector<ImportedOutputId> importedOutputIds =
1365 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1366 CHECK(importedOutputIds.size() == 1);
1367 // Do the inference and force the import as the memory is aligned.
1368 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
1369
1370 // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1371 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1372 std::stringstream ss;
1373 profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1374 std::string dump = ss.str();
1375
1376 if (backends[0] == Compute::CpuAcc)
1377 {
1378 // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1379 // reconfigure is implemented
1380 int count = SubStringCounter(dump, "SyncMemGeneric");
1381 CHECK(count == 0);
1382 // Should be 2 CopyMemGeneric workloads
1383 count = SubStringCounter(dump, "CopyMemGeneric");
1384 CHECK(count >= 1);
1385 }
1386 else
1387 {
1388 // Check there is at least 1 SyncMemGeneric workload as we exported
1389 int count = SubStringCounter(dump, "SyncMemGeneric");
1390 CHECK(count >= 1);
1391 // Shouldn't be any CopyMemGeneric workloads
1392 count = SubStringCounter(dump, "CopyMemGeneric");
1393 CHECK(count == 0);
1394 }
1395 // Check the output is correct
1396 CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1397
1398 // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1399 // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1400 auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1401 float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    CHECK(reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);

    std::vector<float> inputValues
    {
        2.0f, 3.0f, 4.0f, 5.0f
    };

    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));

    auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
    float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    CHECK(reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);

    std::vector<float> expectedMisalignedOutput
    {
        4.0f, 9.0f, 16.0f, 25.0f
    };

    INFO("Create Second Inference");
    InputTensors inputTensorsMisaligned
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr) },
    };
    OutputTensors outputTensorsMisaligned
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr) }
    };
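    // Importing with MemorySource::Malloc requires suitably aligned addresses, so these
    // misaligned pointers should be rejected and the import calls should return no ids.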
    importedInputIds = runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
    // Import should fail.
    CHECK(importedInputIds.size() == 0);
    importedOutputIds = runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
    // Import should fail.
    CHECK(importedOutputIds.size() == 0);

    // Do the inference; the imports failed, so the runtime falls back to copying the misaligned buffers.
    runtime->EnqueueWorkload(netId,
                             inputTensorsMisaligned,
                             outputTensorsMisaligned,
                             importedInputIds,
                             importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
    // new set of addresses within GPU memory. This will almost always be auto-aligned, so we don't need to check
    // for imports/copies, only that the output is correct.
    if (backends[0] != Compute::GpuAcc)
    {
        // The SyncMemGeneric will still be in the profiling log from the first inference
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count >= 1);
        // We should now see CopyMemGeneric workloads as we copied all buffers
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count >= 1);
    }
    // Check the output is correct
    unsigned int index = 0;
    std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
    std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
    for (auto outputValue : expectedMisalignedOutput)
    {
        CHECK(outputValue == alignedOutputData[index]);
        ++index;
    }
    // Clean up to avoid interfering with other tests
    runtime->UnloadNetwork(netId);
    std::free(inputMemPtr);
    std::free(outputMemPtr);
}

inline void ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)
{
    /**
     * This test is similar to the Import tests above: we create a network with a Square activation, pass in a
     * vector of 4 floats, square them and validate the output. We then check the profiling logs to see whether
     * the input/output tensors were copied (CopyMemGeneric) or imported (SyncMemGeneric).
     * Here we first create misaligned buffers, which the network must copy, and validate the output and the
     * number of SyncMemGeneric/CopyMemGeneric workloads. Then we run the same network again with aligned buffers
     * to make sure it switches back to importing correctly.
     */
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());
    IConnectableLayer* input = net->AddInputLayer(0);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::Square;
    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);

    IConnectableLayer* output = net->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
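    // Note: the trailing 'true' above marks the input TensorInfo as constant; ConstTensor, which
    // wraps the input data further down, requires a TensorInfo with the constant flag set.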
    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));

    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
    INFO("Load Network");
    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string errorMessage;
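    // MemorySource::Undefined at load time means no import is set up here; the buffers are
    // imported per-inference via ImportInputs/ImportOutputs instead.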
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    armnn::Status loadingStatus = runtime->LoadNetwork(netId, std::move(optNet), errorMessage, networkProperties);
    CHECK_MESSAGE(loadingStatus == Status::Success, errorMessage);
    INFO("Generate Data");

    // This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
    // this guarantees that the resulting buffer is misaligned and should therefore always be copied.
    auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
    float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    uintptr_t alignment = GetDataTypeSize(DataType::Float32);
    CHECK(reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
    std::vector<float> inputValues
    {
        2.0f, 3.0f, 4.0f, 5.0f
    };
    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));

    auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
    float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);

    // Check if our pointer is truly misaligned
    CHECK(reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);

    std::vector<float> expectedMisalignedOutput
    {
        4.0f, 9.0f, 16.0f, 25.0f
    };

    INFO("Create First Inference");
    InputTensors inputTensorsMisaligned
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr) },
    };
    OutputTensors outputTensorsMisaligned
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr) }
    };
    runtime->GetProfiler(netId)->EnableProfiling(true);
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
    // Import should fail.
    CHECK(importedInputIds.size() == 0);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
    // Import should fail.
    CHECK(importedOutputIds.size() == 0);

    // Do the inference; the imports failed, so the runtime falls back to copying the misaligned buffers.
    runtime->EnqueueWorkload(netId,
                             inputTensorsMisaligned,
                             outputTensorsMisaligned,
                             importedInputIds,
                             importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
    // new set of addresses within GPU memory. This will almost always be auto-aligned, so we don't need to check
    // for imports/copies, only that the output is correct.
    if (backends[0] != Compute::GpuAcc)
    {
        // We can only copy, so there should be no SyncMemGeneric
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 0);
        // Should only be CopyMemGeneric workloads as we copied all buffers
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count >= 1);
    }
    // Check the output is correct
    unsigned int index = 0;
    std::vector<float> alignedOutput(expectedMisalignedOutput.size());
    std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
    for (auto outputValue : expectedMisalignedOutput)
    {
        CHECK(outputValue == alignedOutput[index]);
        ++index;
    }
    std::free(inputMemPtr);
    std::free(outputMemPtr);

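    // Second phase: repeat the inference with aligned buffers to confirm the runtime switches
    // from copying back to importing.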
    // Creates structures for input & output
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };
    std::vector<float> outputData(4);
    std::vector<float> expectedOutput
    {
        1.0f, 4.0f, 9.0f, 16.0f
    };

    // Check our input and output pointers are actually aligned
    CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
    CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));

    INFO("Create Second Inference");
    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data()) },
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    importedInputIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    CHECK(importedInputIds.size() == 1);
    importedOutputIds = runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
    CHECK(importedOutputIds.size() == 1);
    // Do the inference and force the import as the memory is aligned.
    runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    if (backends[0] == Compute::CpuAcc)
    {
        // Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
        // reconfigure is implemented
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count == 0);
        // Expect at least one CopyMemGeneric workload
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count >= 1);
    }
    else
    {
        // Repeated inferences make it difficult to check for an accurate count, so we just validate that we
        // have a SyncMemGeneric workload where we previously didn't
        int count = SubStringCounter(dump, "SyncMemGeneric");
        CHECK(count >= 1);
        // Should still be some CopyMemGeneric workloads from the last inference
        count = SubStringCounter(dump, "CopyMemGeneric");
        CHECK(count >= 1);
    }
    // Check the output is correct
    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
    // Clean up to avoid interfering with other tests
    runtime->UnloadNetwork(netId);
}
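
// A sketch of how one of these helpers is typically driven from a doctest case; the exact
// test-suite wiring and the available backends vary per build, so treat this as illustrative only:
//
//   TEST_CASE("ForceImportRepeatedInferencesInvertedEndToEnd")
//   {
//       ForceImportRepeatedInferencesInvertedEndToEndTest({ armnn::Compute::CpuRef });
//   }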

} // anonymous namespace