//
// Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <CommonTestUtils.hpp>
#include <backendsCommon/test/mockBackend/MockImportBackend.hpp>

#include <GraphUtils.hpp>

#include <doctest/doctest.h>

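// These tests check how the optimizer partitions a network between a Neon
// (CpuAcc) backend and a second backend (MockRef or GpuAcc): depending on
// whether memory import/export is enabled and whether the workloads involved
// use padded tensors, the boundary between backends should be bridged by
// either a MemImport layer (zero-copy) or a MemCopy layer.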
TEST_SUITE("NeonFallback")
{
TEST_CASE("FallbackImportToCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

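    // The optimizer names each boundary layer it inserts after the connection
    // it replaces, e.g. "[ add (0) -> sub (1) ]" for the link from add's
    // output slot 0 to sub's input slot 1.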
    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
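    // (CheckOrder returns true when the first layer appears before the second
    // in the graph's sorted order.)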
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
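    // MemorySource::Malloc for inputs and outputs lets the runtime import and
    // export the caller's buffers rather than copying them.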
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();
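    // The profiler dump names every workload that ran, so string searches on
    // it reveal which memory-handling workloads were actually executed.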

    // Contains ImportMemGeneric
    std::size_t found = dump.find("ImportMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemImport));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackPaddingCopyToCpuAcc")
{
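    // As the name suggests, this exercises the padding case: the CpuAcc
    // pooling workload operates on padded tensors, so the backend boundary is
    // expected to fall back to a copy rather than an import.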
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ add (0) -> pooling (0) ]");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput
    {
        6.0f, 12.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer3->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackImportFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    sub->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ sub (0) -> add (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        13.0f, 11.0f, 11.0f, 9.0f, 7.0f, 7.0f, 7.0f, 5.0f, 5.0f, 3.0f, 3.0f, -5.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ImportMemGeneric
    std::size_t found = dump.find("ImportMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemImport));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackPaddingCopyFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    pooling->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo inputInfo = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(inputInfo);
    input1->GetOutputSlot(0).SetTensorInfo(poolingInfo);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
    add->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ pooling (0) -> add (0) ]");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f
    };
    std::vector<float> inputData1
    {
        -1.0f, 3.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput
    {
        5.0f, 15.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer3->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackDisableImportFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    sub->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
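    // Import/export is left at its default (disabled) and the network is
    // loaded with MemorySource::Undefined, so copies are expected between
    // the backends.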
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ sub (0) -> add (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        13.0f, 11.0f, 11.0f, 9.0f, 7.0f, 7.0f, 7.0f, 5.0f, 5.0f, 3.0f, 3.0f, -5.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

#if defined(ARMCOMPUTECL_ENABLED)
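// The following tests exercise fallback between CpuAcc and the OpenCL backend
// and are only compiled when Arm Compute CL support is enabled.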
TEST_CASE("NeonImportEnabledFallbackToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(16);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f
    };

    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    // Prepare aligned data
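    // The backend can only import a buffer whose base address meets its
    // alignment requirement, so over-allocate and align the pointer by hand.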
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::copy(inputData2.begin(), inputData2.end(), inputPtr);

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, alignedInputPtr) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    for (unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(outputData[i] == expectedOutput[i]);
    }
    runtime->UnloadNetwork(netId);
}

TEST_CASE("NeonImportDisabledFallbackToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("NeonImportEnabledFallbackSubgraphToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;
    desc.m_PoolWidth = 2;
    desc.m_PoolHeight = 2;
    desc.m_StrideX = 2;
    desc.m_StrideY = 2;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f };

    // Prepare aligned data
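    // As above, over-allocate so the buffer passed for import can be aligned
    // to the backend's requirement.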
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::copy(inputData2.begin(), inputData2.end(), inputPtr);

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, alignedInputPtr) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to CpuAcc
    found = dump.find("NeonPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
    runtime->UnloadNetwork(netId);
}

TEST_CASE("NeonImportDisableFallbackSubgraphToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput{ 11.0f, -1.0f };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to CpuAcc
    found = dump.find("NeonPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}
#endif

}