//
// Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <CommonTestUtils.hpp>
#include <backendsCommon/test/mockBackend/MockImportBackend.hpp>

#include <GraphUtils.hpp>

#include <doctest/doctest.h>

TEST_SUITE("NeonFallback")
{
TEST_CASE("FallbackImportToCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create the runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
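    // Note on the load step below: the INetworkProperties arguments are
    // (asyncEnabled, inputSource, outputSource). Passing MemorySource::Malloc for both
    // sources tells the runtime that the caller's input/output buffers are malloc-backed
    // and may be imported/exported rather than copied; the ImportMemGeneric and
    // SyncMemGeneric profiler checks later in this test rely on that choice.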
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ImportMemGeneric
    std::size_t found = dump.find("ImportMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemImport));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackPaddingCopyToCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create the runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ add (0) -> pooling (0) ]");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));

    // Load it into the runtime. It should pass.
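    // Note: in this test the CpuAcc pooling workload needs padded tensor memory (hence
    // the test name), so the boundary between MockRef and CpuAcc is expected to fall
    // back to a copy. The checks below look for CopyMemGeneric in the profile, and for
    // the absence of ImportMemGeneric, even though import/export were requested at
    // optimize time.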
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput
    {
        6.0f, 12.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer3->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackImportFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create the runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    sub->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ sub (0) -> add (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
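    // Note: this is the mirror of FallbackImportToCpuAcc. The boundary layer
    // "[ sub (0) -> add (1) ]" now carries data out of the CpuAcc portion of the graph,
    // and with Malloc import/export sources it is still expected to be a MemImport layer
    // (ImportMemGeneric in the profile) rather than a copy.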
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        13.0f, 11.0f, 11.0f, 9.0f, 7.0f, 7.0f, 7.0f, 5.0f, 5.0f, 3.0f, 3.0f, -5.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ImportMemGeneric
    std::size_t found = dump.find("ImportMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemImport));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackPaddingCopyFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create the runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    pooling->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo inputInfo = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(inputInfo);
    input1->GetOutputSlot(0).SetTensorInfo(poolingInfo);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
    add->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ pooling (0) -> add (0) ]");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));

    // Load it into the runtime. It should pass.
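    // Note: as in FallbackPaddingCopyToCpuAcc, the padded memory used by the CpuAcc
    // pooling workload cannot be handed across the backend boundary by import, so the
    // "[ pooling (0) -> add (0) ]" layer is expected to be a MemCopy and the profile
    // below should show CopyMemGeneric but no ImportMemGeneric.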
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f
    };
    std::vector<float> inputData1
    {
        -1.0f, 3.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput
    {
        5.0f, 15.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer3->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("FallbackDisableImportFromCpuAcc")
{
    using namespace armnn;

    // Create a mock backend object
    MockImportBackendInitialiser initialiser; // Register the Mock Backend
    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
    CHECK((backendObjPtr != nullptr));

    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
    if (backendIds.find("MockRef") == backendIds.end())
    {
        std::string message = "Cannot load MockRef";
        FAIL(message);
    }

    // Create the runtime in which the test will run and allow fallback to CpuAcc.
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    sub->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);

    // Optimize the network
    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ sub (0) -> add (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Load it into the runtime. It should pass.
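    // Note: unlike the tests above, this network is optimized without import/export
    // enabled and is loaded with both memory sources set to MemorySource::Undefined,
    // so the boundary layer "[ sub (0) -> add (1) ]" must be realised as a MemCopy
    // (CopyMemGeneric in the profile) even though the graph is otherwise identical to
    // FallbackImportFromCpuAcc.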
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        13.0f, 11.0f, 11.0f, 9.0f, 7.0f, 7.0f, 7.0f, 5.0f, 5.0f, 3.0f, 3.0f, -5.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains CopyMemGeneric between the backends
    std::size_t found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain ImportMemGeneric
    found = dump.find("ImportMemGeneric");
    CHECK(found == std::string::npos);

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

#if defined(ARMCOMPUTECL_ENABLED)
TEST_CASE("NeonImportEnabledFallbackToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
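    // Note: below, input2 is deliberately staged in an over-allocated buffer that is
    // realigned to a 64-byte boundary with std::align before being wrapped in a
    // ConstTensor. Importing into the CL backend is sensitive to the alignment of the
    // host pointer, so the test controls it explicitly rather than relying on the
    // vector's default allocation.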
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(16);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f
    };

    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    // Prepare aligned data for input2
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::copy(inputData2.begin(), inputData2.end(), inputPtr);

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, alignedInputPtr) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    for (unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(outputData[i] == expectedOutput[i]);
    }
    runtime->UnloadNetwork(netId);
}

TEST_CASE("NeonImportDisabledFallbackToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
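    // Note: this variant calls the LoadNetwork overload that takes no INetworkProperties,
    // so default (non-importing) memory handling is used and plain host-side vectors are
    // sufficient for all inputs; the profile is still expected to contain CopyMemGeneric.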
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("NeonImportEnabledFallbackSubgraphToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;
    desc.m_PoolWidth = 2;
    desc.m_PoolHeight = 2;
    desc.m_StrideX = 2;
    desc.m_StrideY = 2;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
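    // Note: only the hinted sub layer moves to GpuAcc; add and pooling stay on CpuAcc.
    // The profile below is therefore expected to show both ClSubtractionWorkload_Execute
    // and NeonPooling2dWorkload_Execute, with copies at each backend boundary and a
    // SyncMemGeneric for the exported output.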
    NetworkId netId;
    std::string ignoredErrorMessage;

    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);

    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f };

    // Prepare aligned data for input2
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::copy(inputData2.begin(), inputData2.end(), inputPtr);

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, alignedInputPtr) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to CpuAcc
    found = dump.find("NeonPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric for the output
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
    runtime->UnloadNetwork(netId);
}

TEST_CASE("NeonImportDisableFallbackSubgraphToCl")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
    // Use BackendSelectionHint to specify GpuAcc for the Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // Optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory copy between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::GpuAcc));

    // Load it into the runtime. It should pass.
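    // Note: the same CpuAcc -> GpuAcc -> CpuAcc subgraph as above, but loaded without
    // network properties, so import/export stay disabled and plain vectors can back
    // every tensor; CopyMemGeneric is still expected at both backend boundaries.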
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput{ 11.0f, -1.0f };

    armnn::TensorInfo inputTensorInfo0 = runtime->GetInputTensorInfo(netId, 0);
    armnn::TensorInfo inputTensorInfo1 = runtime->GetInputTensorInfo(netId, 1);
    armnn::TensorInfo inputTensorInfo2 = runtime->GetInputTensorInfo(netId, 2);
    inputTensorInfo0.SetConstant(true);
    inputTensorInfo1.SetConstant(true);
    inputTensorInfo2.SetConstant(true);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(inputTensorInfo0, inputData0.data()) },
        { 1, armnn::ConstTensor(inputTensorInfo1, inputData1.data()) },
        { 2, armnn::ConstTensor(inputTensorInfo2, inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using GpuAcc
    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to CpuAcc
    found = dump.find("NeonPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}
#endif

}