• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
6 #include "LayersFwd.hpp"
7 
8 #include <ResolveType.hpp>
9 #include <armnn/INetwork.hpp>
10 #include <GraphUtils.hpp>
11 #include <TestUtils.hpp>
12 
13 #include <doctest/doctest.h>
14 
15 #include <armnnUtils/QuantizeHelper.hpp>
16 #include <string>
17 
18 using namespace armnn;
19 
20 namespace
21 {
22 
// Builds a vector of `size` elements of type T forming the arithmetic sequence
// initial, initial + increment, initial + 2*increment, ...
//
// Fixes two defects in the previous version:
//  * a `std::vector<float> typeVector(size, initial)` was allocated and never used;
//  * the fill loop was guarded by `if (size > 1)`, so a single-element request
//    returned {T{}} (value-initialised) instead of {T(initial)}.
template<typename T>
std::vector<T> GetVector(unsigned int size, float initial, float increment)
{
    std::vector<T> vector(size);
    for (unsigned int i = 0; i < size; ++i)
    {
        vector[i] = T(initial + (increment * static_cast<float>(i)));
    }
    return vector;
}
38 
39 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
40 struct Convolution2dTest
41 {
42     using LayerType = Convolution2dLayer;
43     static const bool isElementWise = false;
44     static const bool isConstTensorAsInputSupported = true;
45 
GetInputShape__anond1a1e5130111::Convolution2dTest46     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::Convolution2dTest47     static TensorShape GetOutputShape()  { return TensorShape( {1, 3, 3, 4}); }  // NHWCout
GetWeightsShape__anond1a1e5130111::Convolution2dTest48     static TensorShape GetWeightsShape() { return TensorShape( {4, 2, 2, 3}); }  // CoutHWCin
49 
50     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
51     constexpr static const unsigned int outputSize = 36; // batchOut * heightOut * widthOut * channelOut
52 
AddReceiverLayer__anond1a1e5130111::Convolution2dTest53     static IConnectableLayer* AddReceiverLayer(INetwork* network,
54                                                const char* name,
55                                                float scale = 1.f,
56                                                int32_t offset = 0)
57     {
58         IgnoreUnused(scale);
59         IgnoreUnused(offset);
60 
61         Convolution2dDescriptor descriptor;
62         descriptor.m_DataLayout  = DataLayout::NHWC;
63         descriptor.m_StrideX     = 1;
64         descriptor.m_StrideY     = 1;
65 
66         return network->AddConvolution2dLayer(descriptor, name);
67     }
68 
AddConstantLayers__anond1a1e5130111::Convolution2dTest69     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
70                                                              float scale = 1.f,
71                                                              int32_t offset = 0)
72     {
73 
74         std::vector<float> weightsData   = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
75                                              11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
76                                              21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
77                                              31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42 };
78         std::vector<T>     weightsVector = armnnUtils::QuantizedVector<T>(weightsData, scale, offset);
79         TensorInfo         weightsInfo(GetWeightsShape(), ArmnnType, scale, offset, true);
80         ConstTensor        weights(weightsInfo, weightsVector);
81 
82         IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
83         weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
84 
85         std::vector<IConnectableLayer*> layers = { weightsLayer };
86         return layers;
87     }
88 };
89 
90 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
91 struct DWConvolution2dTest
92 {
93 public:
94     using LayerType = DepthwiseConvolution2dLayer;
95     static const bool isElementWise = false;
96     static const bool isConstTensorAsInputSupported = true;
97 
GetInputShape__anond1a1e5130111::DWConvolution2dTest98     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }   // [N,H,W,Cin]
GetOutputShape__anond1a1e5130111::DWConvolution2dTest99     static TensorShape GetOutputShape()  { return TensorShape( {1, 3, 3, 12}); }  // [N,H,W,Cout]
GetWeightsShape__anond1a1e5130111::DWConvolution2dTest100     static TensorShape GetWeightsShape() { return TensorShape( {1, 2, 2, 12}); }  // [1,H,W,Cout]
101 
102     constexpr static const unsigned int inputSize  = 48; //batchIn * heightIn * widthIn * channelIn;
103     constexpr static const unsigned int outputSize = 108; //batchOut * heightOut * widthOut * channelOut;
104 
AddReceiverLayer__anond1a1e5130111::DWConvolution2dTest105     static IConnectableLayer* AddReceiverLayer(INetwork* network,
106                                                const char* name,
107                                                float scale = 1.f,
108                                                int32_t offset = 0)
109     {
110         IgnoreUnused(scale);
111         IgnoreUnused(offset);
112 
113         DepthwiseConvolution2dDescriptor descriptor;
114         descriptor.m_BiasEnabled = false;
115         descriptor.m_DataLayout  = DataLayout::NHWC;
116         descriptor.m_StrideX     = 1;
117         descriptor.m_StrideY     = 1;
118 
119         return network->AddDepthwiseConvolution2dLayer(descriptor, name);
120     }
121 
AddConstantLayers__anond1a1e5130111::DWConvolution2dTest122     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
123                                                              float scale = 1.f,
124                                                              int32_t offset = 0)
125     {
126         std::vector<float> weightsData   = { 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
127                                              11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
128                                              21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
129                                              31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42};
130         std::vector<T>     weightsVector = armnnUtils::QuantizedVector<T>(weightsData, scale, offset);
131         TensorInfo         weightsInfo(GetWeightsShape(), ArmnnType, scale, offset, true);
132         ConstTensor        weights(weightsInfo, weightsVector);
133 
134         IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
135         weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
136 
137         std::vector<IConnectableLayer*> layers = { weightsLayer };
138         return layers;
139     }
140 };
141 
142 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
143 struct FullyConnectedTest
144 {
145 public:
146     using LayerType = FullyConnectedLayer;
147     static const bool isElementWise = false;
148     static const bool isConstTensorAsInputSupported = true;
149 
GetInputShape__anond1a1e5130111::FullyConnectedTest150     static TensorShape GetInputShape()   { return TensorShape( {2, 5, 1, 1}); } // NCinHW
GetOutputShape__anond1a1e5130111::FullyConnectedTest151     static TensorShape GetOutputShape()  { return TensorShape( {2, 3}); }       // NCout
GetWeightsShape__anond1a1e5130111::FullyConnectedTest152     static TensorShape GetWeightsShape() { return TensorShape( {5, 3}); }       // CinCout
153 
154     constexpr static const unsigned int inputSize  = 10; // batchIn * heightIn * widthIn * channelIn
155     constexpr static const unsigned int outputSize = 6;  // batchOut * heightOut * widthOut * channelOut
156 
AddReceiverLayer__anond1a1e5130111::FullyConnectedTest157     static IConnectableLayer* AddReceiverLayer(INetwork* network,
158                                                const char* name,
159                                                float scale = 1.f,
160                                                int32_t offset = 0)
161     {
162         IgnoreUnused(scale);
163         IgnoreUnused(offset);
164 
165         FullyConnectedDescriptor descriptor;
166         descriptor.m_BiasEnabled = false;
167 
168         return network->AddFullyConnectedLayer(descriptor, name);
169     }
170 
AddConstantLayers__anond1a1e5130111::FullyConnectedTest171     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
172                                                              float scale = 1.f,
173                                                              int32_t offset = 0)
174     {
175         std::vector<float> weightsData   = { 1,  2,  3,  4,  5,
176                                              6,  7,  8,  9, 10,
177                                              11, 12, 13, 14, 15};
178         std::vector<T>     weightsVector = armnnUtils::QuantizedVector<T>(weightsData, scale, offset);
179         TensorInfo         weightsInfo(GetWeightsShape(), ArmnnType, scale, offset, true);
180         ConstTensor        weights(weightsInfo, weightsVector);
181 
182         IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
183         weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
184 
185         std::vector<IConnectableLayer*> layers = { weightsLayer };
186         return layers;
187     }
188 };
189 
190 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
191 struct BatchNormTest
192 {
193 public:
194     using LayerType = BatchNormalizationLayer;
195     static const bool isElementWise = false;
196     static const bool isConstTensorAsInputSupported = false;
197 
GetInputShape__anond1a1e5130111::BatchNormTest198     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::BatchNormTest199     static TensorShape GetOutputShape()  { return TensorShape( {1, 4, 4, 3}); }  // NHWCout
200 
201     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
202     constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut
203 
AddReceiverLayer__anond1a1e5130111::BatchNormTest204     static IConnectableLayer* AddReceiverLayer(INetwork* network,
205                                                const char* name,
206                                                float scale = 1.f,
207                                                int32_t offset = 0)
208     {
209         IgnoreUnused(scale);
210         IgnoreUnused(offset);
211 
212         BatchNormalizationDescriptor descriptor;
213         descriptor.m_DataLayout = DataLayout::NHWC;
214 
215         std::vector<T> betaVector     = GetVector<T>(GetOutputShape()[3], 0.0f, 0.2f);
216         std::vector<T> gammaVector    = GetVector<T>(GetOutputShape()[3], 0.5f, 0.1f);
217         std::vector<T> meanVector     = GetVector<T>(GetOutputShape()[3], 0.1f, 0.1f);
218         std::vector<T> varianceVector = GetVector<T>(GetOutputShape()[3], 1.0f, 0.1f);
219 
220         const unsigned int outputChannelSize[] = { GetOutputShape()[3] };
221         ConstTensor beta(TensorInfo(1, outputChannelSize, ArmnnType, 0.0f, 0, true), betaVector);
222         ConstTensor gamma(TensorInfo(1, outputChannelSize, ArmnnType, 0.0f, 0, true), gammaVector);
223         ConstTensor mean(TensorInfo(1, outputChannelSize, ArmnnType, 0.0f, 0, true), meanVector);
224         ConstTensor variance(TensorInfo(1, outputChannelSize, ArmnnType, 0.0f, 0, true), varianceVector);
225 
226         return network->AddBatchNormalizationLayer(descriptor, mean, variance, beta, gamma, name);
227     }
228 
AddConstantLayers__anond1a1e5130111::BatchNormTest229     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
230                                                              float scale = 1.f,
231                                                              int32_t offset = 0)
232     {
233         IgnoreUnused(network);
234         IgnoreUnused(scale);
235         IgnoreUnused(offset);
236         return {};
237     }
238 };
239 
240 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
241 struct MultiplicationTest
242 {
243     using LayerType = MultiplicationLayer;
244     static const bool isElementWise = true;
245     static const bool isConstTensorAsInputSupported = false;
246 
GetInputShape__anond1a1e5130111::MultiplicationTest247     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::MultiplicationTest248     static TensorShape GetOutputShape()  { return TensorShape( {1, 4, 4, 3}); }  // NHWCout
249 
250     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
251     constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut
252 
AddReceiverLayer__anond1a1e5130111::MultiplicationTest253     static IConnectableLayer* AddReceiverLayer(INetwork* network,
254                                                const char* name,
255                                                float scale = 1.f,
256                                                int32_t offset = 0)
257     {
258         IgnoreUnused(scale);
259         IgnoreUnused(offset);
260 
261         ARMNN_NO_DEPRECATE_WARN_BEGIN
262         return network->AddMultiplicationLayer(name);
263         ARMNN_NO_DEPRECATE_WARN_END
264     }
265 
AddConstantLayers__anond1a1e5130111::MultiplicationTest266     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
267                                                              float scale = 1.f,
268                                                              int32_t offset = 0)
269     {
270         IgnoreUnused(network);
271         IgnoreUnused(scale);
272         IgnoreUnused(offset);
273         return {};
274     }
275 };
276 
277 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
278 struct AdditionTest
279 {
280     using LayerType = AdditionLayer;
281     static const bool isElementWise = true;
282     static const bool isConstTensorAsInputSupported = false;
283 
GetInputShape__anond1a1e5130111::AdditionTest284     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::AdditionTest285     static TensorShape GetOutputShape()  { return TensorShape( {1, 4, 4, 3}); }  // NHWCout
286 
287     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
288     constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut
289 
AddReceiverLayer__anond1a1e5130111::AdditionTest290     static IConnectableLayer* AddReceiverLayer(INetwork* network,
291                                                const char* name,
292                                                float scale = 1.f,
293                                                int32_t offset = 0)
294     {
295         IgnoreUnused(scale);
296         IgnoreUnused(offset);
297 
298         ARMNN_NO_DEPRECATE_WARN_BEGIN
299         return network->AddAdditionLayer(name);
300         ARMNN_NO_DEPRECATE_WARN_END
301     }
302 
AddConstantLayers__anond1a1e5130111::AdditionTest303     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
304                                                              float scale = 1.f,
305                                                              int32_t offset = 0)
306     {
307         IgnoreUnused(network);
308         IgnoreUnused(scale);
309         IgnoreUnused(offset);
310         return {};
311     }
312 };
313 
314 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
315 struct SubtractionTest
316 {
317     using LayerType = SubtractionLayer;
318     static const bool isElementWise = true;
319     static const bool isConstTensorAsInputSupported = false;
320 
GetInputShape__anond1a1e5130111::SubtractionTest321     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::SubtractionTest322     static TensorShape GetOutputShape()  { return TensorShape( {1, 4, 4, 3}); }  // NHWCout
323 
324     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
325     constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut
326 
AddReceiverLayer__anond1a1e5130111::SubtractionTest327     static IConnectableLayer* AddReceiverLayer(INetwork* network,
328                                                const char* name,
329                                                float scale = 1.f,
330                                                int32_t offset = 0)
331     {
332         IgnoreUnused(scale);
333         IgnoreUnused(offset);
334 
335         ARMNN_NO_DEPRECATE_WARN_BEGIN
336         return network->AddSubtractionLayer(name);
337         ARMNN_NO_DEPRECATE_WARN_END
338     }
339 
AddConstantLayers__anond1a1e5130111::SubtractionTest340     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
341                                                              float scale = 1.f,
342                                                              int32_t offset = 0)
343     {
344         IgnoreUnused(network);
345         IgnoreUnused(scale);
346         IgnoreUnused(offset);
347         return {};
348     }
349 };
350 
351 template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
352 struct DivisionTest
353 {
354     using LayerType = DivisionLayer;
355     static const bool isElementWise = true;
356     static const bool isConstTensorAsInputSupported = false;
357 
GetInputShape__anond1a1e5130111::DivisionTest358     static TensorShape GetInputShape()   { return TensorShape( {1, 4, 4, 3}); }  // NHWCin
GetOutputShape__anond1a1e5130111::DivisionTest359     static TensorShape GetOutputShape()  { return TensorShape( {1, 4, 4, 3}); }  // NHWCout
360 
361     constexpr static const unsigned int inputSize  = 48; // batchIn * heightIn * widthIn * channelIn
362     constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut
363 
AddReceiverLayer__anond1a1e5130111::DivisionTest364     static IConnectableLayer* AddReceiverLayer(INetwork* network,
365                                                const char* name,
366                                                float scale = 1.f,
367                                                int32_t offset = 0)
368     {
369         IgnoreUnused(scale);
370         IgnoreUnused(offset);
371 
372         ARMNN_NO_DEPRECATE_WARN_BEGIN
373         return network->AddDivisionLayer(name);
374         ARMNN_NO_DEPRECATE_WARN_END
375     }
376 
AddConstantLayers__anond1a1e5130111::DivisionTest377     static std::vector<IConnectableLayer*> AddConstantLayers(INetwork* network,
378                                                              float scale = 1.f,
379                                                              int32_t offset = 0)
380     {
381         IgnoreUnused(network);
382         IgnoreUnused(scale);
383         IgnoreUnused(offset);
384         return {};
385     }
386 };
387 
388 template<typename LayerTest,
389          DataType ArmnnType>
CreateNetwork(ActivationDescriptor activationDescriptor,bool preventFusing,float scale,int32_t offset)390 INetworkPtr CreateNetwork(ActivationDescriptor activationDescriptor, bool preventFusing,
391                          float scale, int32_t offset)
392 {
393     // Create a network
394     INetworkPtr network = INetwork::Create();
395 
396     IConnectableLayer* inputLayer = network->AddInputLayer(0);
397 
398     IConnectableLayer* receiverLayer = LayerTest::AddReceiverLayer(network.get(),
399                                                                    "receiverLayer",
400                                                                    scale,
401                                                                    offset);
402 
403     IConnectableLayer* activationLayer = network->AddActivationLayer(activationDescriptor,
404                                                                      "activation");
405 
406     IConnectableLayer* outputLayer  = network->AddOutputLayer(0);
407     IConnectableLayer* output2Layer = preventFusing ? network->AddOutputLayer(1) : nullptr;
408 
409     // If ConstTensorAsInputs is supported weights and bias are stored as constant layers.
410     if (LayerTest::isConstTensorAsInputSupported)
411     {
412         std::vector<IConnectableLayer*> constantLayers = LayerTest::AddConstantLayers(network.get(),
413                                                                                       scale,
414                                                                                       offset);
415 
416         // Connect constant layers to receiverLayer.
417         for (unsigned int i = 0; i < constantLayers.size(); ++i)
418         {
419             constantLayers[i]->GetOutputSlot(0).Connect(receiverLayer->GetInputSlot(i + 1));
420         }
421     }
422 
423     // Define layers information
424     TensorInfo inputInfo(LayerTest::GetInputShape(), ArmnnType, scale, offset);
425     TensorInfo outputInfo(LayerTest::GetOutputShape(), ArmnnType, scale, offset);
426 
427     // Set layer information
428     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
429     receiverLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
430     activationLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
431 
432     // Connect layers
433     inputLayer->GetOutputSlot(0).Connect(receiverLayer->GetInputSlot(0));
434     receiverLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
435     activationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
436 
437     if (LayerTest::isElementWise)
438     {
439         inputLayer->GetOutputSlot(0).Connect(receiverLayer->GetInputSlot(1));
440     }
441     if (preventFusing)
442     {
443         receiverLayer->GetOutputSlot(0).Connect(output2Layer->GetInputSlot(0));
444     }
445 
446     return network;
447 }
448 
449 template<typename LayerTest,
450          DataType ArmnnType,
451          typename LayerType = typename LayerTest::LayerType,
452          typename T = ResolveType<ArmnnType>>
FuseActivationIntoPreviousLayerTest(ActivationDescriptor activationDescriptor,float tolerance,Compute backendId,float scale=1.f,int32_t offset=0)453 void FuseActivationIntoPreviousLayerTest(ActivationDescriptor activationDescriptor, float tolerance, Compute backendId,
454                                          float scale = 1.f, int32_t offset=0)
455 {
456     // FIRST NETWORK: Fused
457     // Construct ArmNN network
458     INetworkPtr networkFused = CreateNetwork<LayerTest, ArmnnType>(activationDescriptor, false, scale, offset);
459 
460     // Create ArmNN runtime
461     IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); // default options
462 
463     // Optimise ArmNN network
464     IOptimizedNetworkPtr optNetFused = Optimize(*networkFused, {backendId}, run->GetDeviceSpec());
465 
466     Graph& graphFused = GetGraphForTesting(optNetFused.get());
467 
468     auto checkFusedConv2d = [](const Layer* const layer)->bool {
469         return IsLayerOfType<LayerType>(layer) &&
470             (layer->GetNameStr() == "fused-activation-into-receiverLayer");
471     };
472 
473     // If ConstTensorAsInputs is supported, weights and bias are stored as constant layers.
474     if(LayerTest::isConstTensorAsInputSupported)
475     {
476         CHECK(4 == graphFused.GetNumLayers());
477         CHECK(CheckSequence(graphFused.cbegin(),
478                             graphFused.cend(),
479                             &IsLayerOfType<InputLayer>,
480                             &IsLayerOfType<ConstantLayer>,
481                             checkFusedConv2d,
482                             &IsLayerOfType<OutputLayer>));
483 
484         // Check if new constant layer is connected to fused receiver layer.
485         Layer* fusedReceiverLayer = GetFirstLayerWithName(graphFused, "fused-activation-into-receiverLayer");
486         CHECK(fusedReceiverLayer);
487         CHECK(fusedReceiverLayer->GetInputSlot(1).GetConnection() != nullptr);
488     }
489     else
490     {
491         CHECK(3 == graphFused.GetNumLayers());
492         CHECK(CheckSequence(graphFused.cbegin(),
493                             graphFused.cend(),
494                             &IsLayerOfType<InputLayer>,
495                             checkFusedConv2d,
496                             &IsLayerOfType<OutputLayer>));
497     }
498 
499     // Load network into runtime
500     NetworkId networkIdentifier;
501     CHECK(run->LoadNetwork(networkIdentifier, std::move(optNetFused)) == Status::Success);
502 
503     //Creates structures for inputs and outputs.
504     std::vector<float> data = GetVector<float>(LayerTest::inputSize, 1.0f, 0.1f);
505     std::vector<T> inputDataFused = armnnUtils::QuantizedVector<T>(data, scale, offset);
506     std::vector<T> outputDataFused(LayerTest::outputSize);
507 
508     armnn::TensorInfo inputTensorInfo = run->GetInputTensorInfo(networkIdentifier, 0);
509     inputTensorInfo.SetConstant(true);
510 
511     InputTensors  inputTensorsFused{
512         {0, ConstTensor(inputTensorInfo, inputDataFused.data())}};
513     OutputTensors outputTensorsFused{
514         {0, Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputDataFused.data())}};
515 
516     // Execute network
517     CHECK(run->EnqueueWorkload(networkIdentifier, inputTensorsFused, outputTensorsFused) == Status::Success);
518 
519     // SECOND NETWORK: NotFused
520     // Construct ArmNN network
521     INetworkPtr networkNotFused = CreateNetwork<LayerTest, ArmnnType>(activationDescriptor, true, scale, offset);
522 
523     // Create ArmNN runtime
524     IRuntimePtr runNotFused = IRuntime::Create(IRuntime::CreationOptions()); // default options
525 
526     // Optimise ArmNN network
527     IOptimizedNetworkPtr optNetNotFused = Optimize(*networkNotFused, {backendId}, runNotFused->GetDeviceSpec());
528 
529     Graph& graphNotFused = GetGraphForTesting(optNetNotFused.get());
530 
531     // If ConstTensorAsInputs is supported, weights and bias are stored as constant layers.
532     if(LayerTest::isConstTensorAsInputSupported)
533     {
534         CHECK(6 == graphNotFused.GetNumLayers());
535         CHECK(CheckSequence(graphNotFused.cbegin(),
536                             graphNotFused.cend(),
537                             &IsLayerOfType<InputLayer>,
538                             &IsLayerOfType<ConstantLayer>,
539                             &IsLayerOfType<LayerType>,
540                             &IsLayerOfType<ActivationLayer>,
541                             &IsLayerOfType<OutputLayer>,
542                             &IsLayerOfType<OutputLayer>));
543     }
544     else
545     {
546         CHECK(5 == graphNotFused.GetNumLayers());
547         CHECK(CheckSequence(graphNotFused.cbegin(),
548                             graphNotFused.cend(),
549                             &IsLayerOfType<InputLayer>,
550                             &IsLayerOfType<LayerType>,
551                             &IsLayerOfType<ActivationLayer>,
552                             &IsLayerOfType<OutputLayer>,
553                             &IsLayerOfType<OutputLayer>));
554     }
555 
556     // Load network into runtime
557     NetworkId networkIdentifierNotFused;
558     CHECK(runNotFused->LoadNetwork(networkIdentifierNotFused, std::move(optNetNotFused)) == Status::Success);
559 
560     //Creates structures for inputs and outputs.
561     std::vector<T> inputDataNotFused = armnnUtils::QuantizedVector<T>(data, scale, offset);
562     std::vector<T> outputDataNotFused(LayerTest::outputSize);
563     std::vector<T> outputData2NotFused(LayerTest::outputSize);
564 
565     TensorInfo inputTensorInfoNotFused = runNotFused->GetInputTensorInfo(networkIdentifierNotFused, 0);
566     inputTensorInfoNotFused.SetConstant(true);
567 
568     InputTensors  inputTensorsNotFused{
569         {0, ConstTensor(inputTensorInfoNotFused, inputDataNotFused.data())}};
570     OutputTensors outputTensorsNotFused{
571         {0, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 0), outputDataNotFused.data())},
572         {1, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 1), outputData2NotFused.data())}};
573 
574     // Execute network
575     CHECK(runNotFused->EnqueueWorkload(networkIdentifierNotFused, inputTensorsNotFused, outputTensorsNotFused)
576                == Status::Success);
577 
578     // Check the output of the fused-activation matches with the output of the activation in the "NotFused" network
579     for (unsigned int n = 0; n < outputDataFused.size(); ++n)
580     {
581         auto outputNotFused = static_cast<float>(outputDataNotFused[n]);
582         CHECK(static_cast<float>(outputDataFused[n]) == doctest::Approx(outputNotFused).epsilon(tolerance));
583     }
584 }
585 
586 template<typename LayerTest,
587          DataType ArmnnType,
588          typename LayerType = typename LayerTest::LayerType,
589          typename T = ResolveType<ArmnnType>>
FuseActivationSimpleTest(ActivationDescriptor activationDescriptor,Compute backendId,float scale=1.f,int32_t offset=0)590 bool FuseActivationSimpleTest(ActivationDescriptor activationDescriptor, Compute backendId,
591                               float scale = 1.f, int32_t offset = 0)
592 {
593     bool success;
594     try
595     {
596         // Construct ArmNN network
597         INetworkPtr networkFused = CreateNetwork<LayerTest, ArmnnType>(activationDescriptor, false, scale, offset);
598 
599         // Create ArmNN runtime
600         IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); // default options
601 
602         // Optimise ArmNN network
603         IOptimizedNetworkPtr optNetFused = Optimize(*networkFused, {backendId}, run->GetDeviceSpec());
604 
605         // Load network into runtime
606         NetworkId networkIdentifier;
607         CHECK(run->LoadNetwork(networkIdentifier, std::move(optNetFused)) == Status::Success);
608 
609         //Creates structures for inputs and outputs.
610         std::vector<float> data           = GetVector<float>(LayerTest::inputSize, 1.0f, 0.1f);
611         std::vector<T>     inputDataFused = armnnUtils::QuantizedVector<T>(data, scale, offset);
612         std::vector<T>     outputDataFused(LayerTest::outputSize);
613 
614         TensorInfo inputTensorInfo = run->GetInputTensorInfo(networkIdentifier, 0);
615         inputTensorInfo.SetConstant(true);
616 
617         InputTensors  inputTensorsFused{
618             {0, ConstTensor(inputTensorInfo, inputDataFused.data())}};
619         OutputTensors outputTensorsFused{
620             {0, Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputDataFused.data())}};
621 
622         // Execute network
623         run->EnqueueWorkload(networkIdentifier, inputTensorsFused, outputTensorsFused);
624 
625         success = true;
626     }
627     catch (const std::exception& e)
628     {
629         std::cerr << e.what() << std::endl;
630         success = false;
631     }
632 
633     return success;
634 }
635 
636 }
637 
638 #if defined(ARMCOMPUTENEON_ENABLED)
639 TEST_SUITE("Optimizer")
640 {
641 // ReLu fused into Receiver Layers Float32
642 TEST_CASE("FuseReLUIntoConvFloat32CpuAccTest")
643 {
644     ActivationDescriptor activationDescriptor;
645     activationDescriptor.m_Function = ActivationFunction::ReLu;
646 
647     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
648         (activationDescriptor, 0.0001f, Compute::CpuAcc);
649 }
650 TEST_CASE("FuseReLUIntoDWConvFloat32CpuAccTest")
651 {
652     ActivationDescriptor activationDescriptor;
653     activationDescriptor.m_Function = ActivationFunction::ReLu;
654 
655     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::Float32>, DataType::Float32>
656         (activationDescriptor, 0.0001f, Compute::CpuAcc);
657 }
658 TEST_CASE("FuseReLUIntoFullyConnectedFloat32CpuAccTest")
659 {
660     ActivationDescriptor activationDescriptor;
661     activationDescriptor.m_Function = ActivationFunction::ReLu;
662 
663     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
664         (activationDescriptor, 0.0001f, Compute::CpuAcc);
665 }
666 TEST_CASE("FuseReLUIntoBatchNormFloat32CpuAccTest")
667 {
668     ActivationDescriptor activationDescriptor;
669     activationDescriptor.m_Function = ActivationFunction::ReLu;
670 
671     FuseActivationIntoPreviousLayerTest<BatchNormTest<DataType::Float32>, DataType::Float32>
672         (activationDescriptor, 0.0001f, Compute::CpuAcc);
673 }
674 
675 // BoundedReLu fused into Receiver Layers Float32
676 TEST_CASE("FuseBoundedReLUIntoConvFloat32CpuAccTest")
677 {
678     ActivationDescriptor activationDescriptor;
679     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
680     activationDescriptor.m_A = 1.0f;
681     activationDescriptor.m_B = -1.0f;
682 
683     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
684         (activationDescriptor, 0.0001f, Compute::CpuAcc);
685 }
686 TEST_CASE("FuseBoundedReLUIntoDWConvFloat32CpuAccTest")
687 {
688     ActivationDescriptor activationDescriptor;
689     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
690     activationDescriptor.m_A = 1.0f;
691     activationDescriptor.m_B = -1.0f;
692 
693     FuseActivationIntoPreviousLayerTest < DWConvolution2dTest < DataType::Float32 > , DataType::Float32 >
694         (activationDescriptor, 0.0001f, Compute::CpuAcc);
695 }
696 TEST_CASE("FuseBoundedReLUIntoFullyConnectedFloat32CpuAccTest")
697 {
698     ActivationDescriptor activationDescriptor;
699     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
700     activationDescriptor.m_A = 1.0f;
701     activationDescriptor.m_B = -1.0f;
702 
703     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
704         (activationDescriptor, 0.0001f, Compute::CpuAcc);
705 }
706 TEST_CASE("FuseBoundedReLUIntoBatchNormFloat32CpuAccTest")
707 {
708     ActivationDescriptor activationDescriptor;
709     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
710     activationDescriptor.m_A = 1.0f;
711     activationDescriptor.m_B = -1.0f;
712 
713     FuseActivationIntoPreviousLayerTest<BatchNormTest<DataType::Float32>, DataType::Float32>
714         (activationDescriptor, 0.0001f, Compute::CpuAcc);
715 }
716 
717 // ReLU fused into Receiver Layers QAsymmU8
718 TEST_CASE("FuseReLUIntoConvQAsymmU8CpuAccTest")
719 {
720     ActivationDescriptor activationDescriptor;
721     activationDescriptor.m_Function = ActivationFunction::ReLu;
722 
723     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
724         (activationDescriptor, 0.0001f, Compute::CpuAcc);
725 }
726 TEST_CASE("FuseReLUIntoDWConvQAsymmU8CpuAccTest")
727 {
728     ActivationDescriptor activationDescriptor;
729     activationDescriptor.m_Function = ActivationFunction::ReLu;
730 
731     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
732         (activationDescriptor, 0.0001f, Compute::CpuAcc);
733 }
734 TEST_CASE("FuseReLUIntoFullyConnectedQAsymmU8CpuAccTest")
735 {
736     ActivationDescriptor activationDescriptor;
737     activationDescriptor.m_Function = ActivationFunction::ReLu;
738 
739     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
740         (activationDescriptor, 0.0001f, Compute::CpuAcc);
741 }
742 
743 // BoundedReLu fused into Receiver Layers QAsymmS8
744 TEST_CASE("FuseBoundedReLUIntoConvQASymmS8CpuAccTest")
745 {
746     ActivationDescriptor activationDescriptor;
747     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
748     activationDescriptor.m_A = 6.0f;
749     activationDescriptor.m_B = 0.0f;
750 
751     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::QAsymmS8>, DataType::QAsymmS8>
752         (activationDescriptor, 0.0001f, Compute::CpuAcc);
753 }
754 TEST_CASE("FuseBoundedReLUIntoDWConvQASymmS8CpuAccTest")
755 {
756     ActivationDescriptor activationDescriptor;
757     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
758     activationDescriptor.m_A = 6.0f;
759     activationDescriptor.m_B = 0.0f;
760 
761     FuseActivationIntoPreviousLayerTest < DWConvolution2dTest < DataType::QAsymmS8 > , DataType::QAsymmS8 >
762         (activationDescriptor, 0.0001f, Compute::CpuAcc);
763 }
764 TEST_CASE("FuseBoundedReLUIntoFullyConnectedQASymmS8CpuAccTest")
765 {
766     ActivationDescriptor activationDescriptor;
767     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
768     activationDescriptor.m_A = 6.0f;
769     activationDescriptor.m_B = 0.0f;
770 
771     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::QAsymmS8>, DataType::QAsymmS8>
772         (activationDescriptor, 0.0001f, Compute::CpuAcc);
773 }
774 
775 // TanH fused into Receiver Layers Float32
776 TEST_CASE("FuseTanHIntoConvFloat32CpuAccTest")
777 {
778     ActivationDescriptor activationDescriptor;
779     activationDescriptor.m_Function = ActivationFunction::TanH;
780 
781     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
782         (activationDescriptor, 0.0001f, Compute::CpuAcc);
783 }
784 
785 // HardSwish fused into Receiver Layers Float32
786 TEST_CASE("FuseHardSwishIntoConvFloat32CpuAccTest")
787 {
788     ActivationDescriptor activationDescriptor;
789     activationDescriptor.m_Function = ActivationFunction::HardSwish;
790 
791     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
792         (activationDescriptor, 0.0001f, Compute::CpuAcc);
793 }
794 
795 // Test that all receiver layers follow by all activation layers work, either fused or not fused
796 TEST_CASE("LayerFollowedByActivationFloat32CpuAccTest")
797 {
798     ActivationDescriptor activationDescriptor;
799     for (int i = 0; i != 12; ++i)
800     {
801         activationDescriptor.m_Function = static_cast<ActivationFunction>(i);
802         activationDescriptor.m_A = 1.0f;
803         activationDescriptor.m_B = -1.0f;
804         CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
805             (activationDescriptor, Compute::CpuAcc)), "Convolution + Activation function " << i);
806         CHECK_MESSAGE((FuseActivationSimpleTest<DWConvolution2dTest<DataType::Float32>, DataType::Float32>
807             (activationDescriptor, Compute::CpuAcc)), "DepthwiseConvolution + Activation function " << i);
808         CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
809             (activationDescriptor, Compute::CpuAcc)), "FullyConnected + Activation function " << i);
810         CHECK_MESSAGE((FuseActivationSimpleTest<BatchNormTest<DataType::Float32>, DataType::Float32>
811             (activationDescriptor, Compute::CpuAcc)), "BatchNorm + Activation function " << i);
812     }
813 }
814 TEST_CASE("LayerFollowedByActivationFloat16CpuAccTest")
815 {
816     ActivationDescriptor activationDescriptor;
817     for (int i = 0; i != 12; ++i)
818     {
819         activationDescriptor.m_Function = static_cast<ActivationFunction>(i);
820         activationDescriptor.m_A = 1.0f;
821         activationDescriptor.m_B = -1.0f;
822         CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::Float16>, DataType::Float16>
823             (activationDescriptor, Compute::CpuAcc)), "Convolution + Activation function " << i);
824         CHECK_MESSAGE((FuseActivationSimpleTest<DWConvolution2dTest<DataType::Float16>, DataType::Float16>
825             (activationDescriptor, Compute::CpuAcc)), "DepthwiseConvolution + Activation function " << i);
826         CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::Float16>, DataType::Float16>
827             (activationDescriptor, Compute::CpuAcc)), "FullyConnected + Activation function " << i);
828         CHECK_MESSAGE((FuseActivationSimpleTest<BatchNormTest<DataType::Float16>, DataType::Float16>
829             (activationDescriptor, Compute::CpuAcc)), "BatchNorm + Activation function " << i);
830     }
831 }
832 TEST_CASE("LayerFollowedByActivationQAsymmU8CpuAccTest")
833 {
834     ActivationDescriptor activationDescriptor;
835 
836     activationDescriptor.m_Function = ActivationFunction::Sigmoid;
837     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
838         (activationDescriptor, Compute::CpuAcc, 1.f / 256.f, 0)), "Convolution + Activation function " <<
839         static_cast<int>(activationDescriptor.m_Function));
840     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
841         (activationDescriptor, Compute::CpuAcc, 1.f / 256.f, 0)), "FullyConnected + Activation function " <<
842         static_cast<int>(activationDescriptor.m_Function));
843 
844     activationDescriptor.m_Function = ActivationFunction::TanH;
845     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
846         (activationDescriptor, Compute::CpuAcc, 1.f / 128.f, 128)), "Convolution + Activation function " <<
847         static_cast<int>(activationDescriptor.m_Function));
848     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
849         (activationDescriptor, Compute::CpuAcc, 1.f / 128.f, 128)), "FullyConnected + Activation function " <<
850         static_cast<int>(activationDescriptor.m_Function));
851 
852     activationDescriptor.m_Function = ActivationFunction::ReLu;
853     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
854         (activationDescriptor, Compute::CpuAcc)), "Convolution + Activation function " <<
855         static_cast<int>(activationDescriptor.m_Function));
856     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
857         (activationDescriptor, Compute::CpuAcc)), "FullyConnected + Activation function " <<
858         static_cast<int>(activationDescriptor.m_Function));
859 
860     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
861     activationDescriptor.m_A = 1.0f;
862     activationDescriptor.m_B = -1.0f;
863     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
864         (activationDescriptor, Compute::CpuAcc)), "Convolution + Activation function " <<
865         static_cast<int>(activationDescriptor.m_Function));
866     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
867         (activationDescriptor, Compute::CpuAcc)), "FullyConnected + Activation function " <<
868         static_cast<int>(activationDescriptor.m_Function));
869 
870     activationDescriptor.m_Function = ActivationFunction::HardSwish;
871     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
872         (activationDescriptor, Compute::CpuAcc)), "Convolution + Activation function " <<
873         static_cast<int>(activationDescriptor.m_Function));
874     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
875         (activationDescriptor, Compute::CpuAcc)), "FullyConnected + Activation function " <<
876         static_cast<int>(activationDescriptor.m_Function));
877 }
878 }
879 #endif
880 
881 #if defined(ARMCOMPUTECL_ENABLED)
882 TEST_SUITE("Optimizer")
883 {
884 // ReLu fused into Receiver Layers Float32
885 TEST_CASE("FuseReLUIntoConvFloat32GpuAccTest")
886 {
887     ActivationDescriptor activationDescriptor;
888     activationDescriptor.m_Function = ActivationFunction::ReLu;
889 
890     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
891         (activationDescriptor, 0.0001f, Compute::GpuAcc);
892 }
893 TEST_CASE("FuseReLUIntoDWConvFloat32GpuAccTest")
894 {
895     ActivationDescriptor activationDescriptor;
896     activationDescriptor.m_Function = ActivationFunction::ReLu;
897 
898     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::Float32>, DataType::Float32>
899         (activationDescriptor, 0.0001f, Compute::GpuAcc);
900 }
901 TEST_CASE("FuseReLUIntoFullyConnectedFloat32GpuAccTest")
902 {
903     ActivationDescriptor activationDescriptor;
904     activationDescriptor.m_Function = ActivationFunction::ReLu;
905 
906     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
907         (activationDescriptor, 0.0001f, Compute::GpuAcc);
908 }
909 TEST_CASE("FuseReLUIntoBatchNormFloat32GpuAccTest")
910 {
911     ActivationDescriptor activationDescriptor;
912     activationDescriptor.m_Function = ActivationFunction::ReLu;
913 
914     FuseActivationIntoPreviousLayerTest<BatchNormTest<DataType::Float32>, DataType::Float32>
915         (activationDescriptor, 0.0001f, Compute::GpuAcc);
916 }
917 TEST_CASE("FuseReLUIntoMulFloat32GpuAccTest")
918 {
919     ActivationDescriptor activationDescriptor;
920     activationDescriptor.m_Function = ActivationFunction::ReLu;
921 
922     FuseActivationIntoPreviousLayerTest<MultiplicationTest<DataType::Float32>, DataType::Float32>
923         (activationDescriptor, 0.0001f, Compute::GpuAcc);
924 }
925 TEST_CASE("FuseReLUIntoAddFloat32GpuAccTest")
926 {
927     ActivationDescriptor activationDescriptor;
928     activationDescriptor.m_Function = ActivationFunction::ReLu;
929 
930     FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float32>, DataType::Float32>
931         (activationDescriptor, 0.0001f, Compute::GpuAcc);
932 }
933 TEST_CASE("FuseReLUIntoSubFloat32GpuAccTest")
934 {
935     ActivationDescriptor activationDescriptor;
936     activationDescriptor.m_Function = ActivationFunction::ReLu;
937 
938     FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float32>, DataType::Float32>
939         (activationDescriptor, 0.0001f, Compute::GpuAcc);
940 }
941 TEST_CASE("FuseReLUIntoDivFloat32GpuAccTest")
942 {
943     ActivationDescriptor activationDescriptor;
944     activationDescriptor.m_Function = ActivationFunction::ReLu;
945 
946     FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float32>, DataType::Float32>
947         (activationDescriptor, 0.0001f, Compute::GpuAcc);
948 }
949 
950 // BoundedReLu fused into Receiver Layers Float32
951 TEST_CASE("FuseBoundedReLUIntoConvFloat32GpuAccTest")
952 {
953     ActivationDescriptor activationDescriptor;
954     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
955     activationDescriptor.m_A = 1.0f;
956     activationDescriptor.m_B = -1.0f;
957 
958     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
959         (activationDescriptor, 0.0001f, Compute::GpuAcc);
960 }
961 TEST_CASE("FuseBoundedReLUIntoDWConvFloat32GpuAccTest")
962 {
963     ActivationDescriptor activationDescriptor;
964     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
965     activationDescriptor.m_A = 1.0f;
966     activationDescriptor.m_B = -1.0f;
967 
968     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::Float32>, DataType::Float32>
969         (activationDescriptor, 0.0001f, Compute::GpuAcc);
970 }
971 TEST_CASE("FuseBoundedReLUIntoFullyConnectedFloat32GpuAccTest")
972 {
973     ActivationDescriptor activationDescriptor;
974     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
975     activationDescriptor.m_A = 1.0f;
976     activationDescriptor.m_B = -1.0f;
977 
978     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
979         (activationDescriptor, 0.0001f, Compute::GpuAcc);
980 }
981 TEST_CASE("FuseBoundedReLUIntoBatchNormFloat32GpuAccTest")
982 {
983     ActivationDescriptor activationDescriptor;
984     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
985     activationDescriptor.m_A = 1.0f;
986     activationDescriptor.m_B = -1.0f;
987 
988     FuseActivationIntoPreviousLayerTest<BatchNormTest<DataType::Float32>, DataType::Float32>
989         (activationDescriptor, 0.0001f, Compute::GpuAcc);
990 }
991 TEST_CASE("FuseBoundedReLUIntoMulFloat32GpuAccTest")
992 {
993     ActivationDescriptor activationDescriptor;
994     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
995     activationDescriptor.m_A = 1.0f;
996     activationDescriptor.m_B = -1.0f;
997 
998     FuseActivationIntoPreviousLayerTest<MultiplicationTest<DataType::Float32>, DataType::Float32>
999         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1000 }
1001 TEST_CASE("FuseBoundedReLUIntoAddFloat32GpuAccTest")
1002 {
1003     ActivationDescriptor activationDescriptor;
1004     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1005     activationDescriptor.m_A = 1.0f;
1006     activationDescriptor.m_B = -1.0f;
1007 
1008     FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float32>, DataType::Float32>
1009         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1010 }
1011 TEST_CASE("FuseBoundedReLUIntoSubFloat32GpuAccTest")
1012 {
1013     ActivationDescriptor activationDescriptor;
1014     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1015     activationDescriptor.m_A = 1.0f;
1016     activationDescriptor.m_B = -1.0f;
1017 
1018     FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float32>, DataType::Float32>
1019         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1020 }
1021 TEST_CASE("FuseBoundedReLUIntoDivFloat32GpuAccTest")
1022 {
1023     ActivationDescriptor activationDescriptor;
1024     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1025     activationDescriptor.m_A = 1.0f;
1026     activationDescriptor.m_B = -1.0f;
1027 
1028     FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float32>, DataType::Float32>
1029         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1030 }
1031 
1032 // ReLu fused into Receiver Layers Float16
1033 TEST_CASE("FuseReLUIntoConvFloat16GpuAccTest")
1034 {
1035     ActivationDescriptor activationDescriptor;
1036     activationDescriptor.m_Function = ActivationFunction::ReLu;
1037 
1038     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float16>, DataType::Float16>
1039         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1040 }
1041 TEST_CASE("FuseReLUIntoDWConvFloat16GpuAccTest")
1042 {
1043     ActivationDescriptor activationDescriptor;
1044     activationDescriptor.m_Function = ActivationFunction::ReLu;
1045 
1046     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::Float16>, DataType::Float16>
1047         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1048 }
1049 TEST_CASE("FuseReLUIntoFullyConnectedFloat16GpuAccTest")
1050 {
1051     ActivationDescriptor activationDescriptor;
1052     activationDescriptor.m_Function = ActivationFunction::ReLu;
1053 
1054     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::Float16>, DataType::Float16>
1055         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1056 }
1057 TEST_CASE("FuseReLUIntoBatchNormFloat16GpuAccTest")
1058 {
1059     ActivationDescriptor activationDescriptor;
1060     activationDescriptor.m_Function = ActivationFunction::ReLu;
1061 
1062     FuseActivationIntoPreviousLayerTest<BatchNormTest<DataType::Float16>, DataType::Float16>
1063         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1064 }
1065 TEST_CASE("FuseReLUIntoMulFloat16GpuAccTest")
1066 {
1067     ActivationDescriptor activationDescriptor;
1068     activationDescriptor.m_Function = ActivationFunction::ReLu;
1069 
1070     FuseActivationIntoPreviousLayerTest<MultiplicationTest<DataType::Float16>, DataType::Float16>
1071         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1072 }
1073 TEST_CASE("FuseReLUIntoAddFloat16GpuAccTest")
1074 {
1075     ActivationDescriptor activationDescriptor;
1076     activationDescriptor.m_Function = ActivationFunction::ReLu;
1077 
1078     FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float16>, DataType::Float16>
1079         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1080 }
1081 TEST_CASE("FuseReLUIntoSubFloat16GpuAccTest")
1082 {
1083     ActivationDescriptor activationDescriptor;
1084     activationDescriptor.m_Function = ActivationFunction::ReLu;
1085 
1086     FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float16>, DataType::Float16>
1087         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1088 }
1089 TEST_CASE("FuseReLUIntoDivFloat16GpuAccTest")
1090 {
1091     ActivationDescriptor activationDescriptor;
1092     activationDescriptor.m_Function = ActivationFunction::ReLu;
1093 
1094     FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float16>, DataType::Float16>
1095         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1096 }
1097 
1098 // ReLU fused into Receiver Layers QAsymmU8
1099 TEST_CASE("FuseReLUQIntoConvAsymmU8GpuAccTest")
1100 {
1101     ActivationDescriptor activationDescriptor;
1102     activationDescriptor.m_Function = ActivationFunction::ReLu;
1103 
1104     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1105         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1106 }
1107 TEST_CASE("FuseReLUQIntoDWConvAsymmU8GpuAccTest")
1108 {
1109     ActivationDescriptor activationDescriptor;
1110     activationDescriptor.m_Function = ActivationFunction::ReLu;
1111 
1112     FuseActivationIntoPreviousLayerTest<DWConvolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1113         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1114 }
1115 TEST_CASE("FuseReLUQIntoFullyConnectedAsymmU8GpuAccTest")
1116 {
1117     ActivationDescriptor activationDescriptor;
1118     activationDescriptor.m_Function = ActivationFunction::ReLu;
1119 
1120     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1121         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1122 }
1123 
1124 // BoundedReLu fused into Receiver Layers QAsymmS8
1125 TEST_CASE("FuseBoundedReLUIntoConvQASymmS8GpuAccTest")
1126 {
1127     ActivationDescriptor activationDescriptor;
1128     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1129     activationDescriptor.m_A = 6.0f;
1130     activationDescriptor.m_B = 0.0f;
1131 
1132     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::QAsymmS8>, DataType::QAsymmS8>
1133         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1134 }
1135 TEST_CASE("FuseBoundedReLUIntoDWConvQASymmS8GpuAccTest")
1136 {
1137     ActivationDescriptor activationDescriptor;
1138     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1139     activationDescriptor.m_A = 6.0f;
1140     activationDescriptor.m_B = 0.0f;
1141 
1142     FuseActivationIntoPreviousLayerTest < DWConvolution2dTest < DataType::QAsymmS8 > , DataType::QAsymmS8 >
1143         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1144 }
1145 TEST_CASE("FuseBoundedReLUIntoFullyConnectedQASymmS8GpuAccTest")
1146 {
1147     ActivationDescriptor activationDescriptor;
1148     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1149     activationDescriptor.m_A = 6.0f;
1150     activationDescriptor.m_B = 0.0f;
1151 
1152     FuseActivationIntoPreviousLayerTest<FullyConnectedTest<DataType::QAsymmS8>, DataType::QAsymmS8>
1153         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1154 }
1155 
1156 // TanH fused into Receiver Layers Float32
1157 TEST_CASE("FuseTanHIntoConvFloat32GpuAccTest")
1158 {
1159     ActivationDescriptor activationDescriptor;
1160     activationDescriptor.m_Function = ActivationFunction::TanH;
1161 
1162     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
1163         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1164 }
1165 TEST_CASE("FuseTanHIntoMulFloat32GpuAccTest")
1166 {
1167     ActivationDescriptor activationDescriptor;
1168     activationDescriptor.m_Function = ActivationFunction::TanH;
1169 
1170     FuseActivationIntoPreviousLayerTest<MultiplicationTest<DataType::Float32>, DataType::Float32>
1171         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1172 }
1173 TEST_CASE("FuseTanHIntoAddFloat32GpuAccTest")
1174 {
1175     ActivationDescriptor activationDescriptor;
1176     activationDescriptor.m_Function = ActivationFunction::TanH;
1177 
1178     FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float32>, DataType::Float32>
1179         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1180 }
1181 TEST_CASE("FuseTanHIntoSubFloat32GpuAccTest")
1182 {
1183     ActivationDescriptor activationDescriptor;
1184     activationDescriptor.m_Function = ActivationFunction::TanH;
1185 
1186     FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float32>, DataType::Float32>
1187         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1188 }
1189 TEST_CASE("FuseTanHIntoDivFloat32GpuAccTest")
1190 {
1191     ActivationDescriptor activationDescriptor;
1192     activationDescriptor.m_Function = ActivationFunction::TanH;
1193 
1194     FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float32>, DataType::Float32>
1195         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1196 }
1197 
1198 // HardSwish fused into Receiver Layers Float32
1199 TEST_CASE("FuseHardSwishIntoConvFloat32GpuAccTest")
1200 {
1201     ActivationDescriptor activationDescriptor;
1202     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1203 
1204     FuseActivationIntoPreviousLayerTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
1205         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1206 }
1207 TEST_CASE("FuseHardSwishIntoMulFloat32GpuAccTest")
1208 {
1209     ActivationDescriptor activationDescriptor;
1210     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1211 
1212     FuseActivationIntoPreviousLayerTest<MultiplicationTest<DataType::Float32>, DataType::Float32>
1213         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1214 }
1215 TEST_CASE("FuseHardSwishIntoAddFloat32GpuAccTest")
1216 {
1217     ActivationDescriptor activationDescriptor;
1218     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1219 
1220     FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float32>, DataType::Float32>
1221         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1222 }
1223 TEST_CASE("FuseHardSwishIntoSubFloat32GpuAccTest")
1224 {
1225     ActivationDescriptor activationDescriptor;
1226     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1227 
1228     FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float32>, DataType::Float32>
1229         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1230 }
1231 TEST_CASE("FuseHardSwishIntoDivFloat32GpuAccTest")
1232 {
1233     ActivationDescriptor activationDescriptor;
1234     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1235 
1236     FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float32>, DataType::Float32>
1237         (activationDescriptor, 0.0001f, Compute::GpuAcc);
1238 }
1239 
1240 // Test that all receiver layers follow by all activation layers work, either fused or not fused
1241 TEST_CASE("LayerFollowedByActivationFloat32GpuAccTest")
1242 {
1243     ActivationDescriptor activationDescriptor;
1244     for (int i = 0; i != 12; ++i)
1245     {
1246         activationDescriptor.m_Function = static_cast<ActivationFunction>(i);
1247         activationDescriptor.m_A = 1.0f;
1248         activationDescriptor.m_B = -1.0f;
1249         if (activationDescriptor.m_Function != ActivationFunction::Elu)
1250         {
1251             CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::Float32>, DataType::Float32>
1252                 (activationDescriptor, Compute::GpuAcc)), "Convolution + Activation function " << i);
1253             CHECK_MESSAGE((FuseActivationSimpleTest<DWConvolution2dTest<DataType::Float32>, DataType::Float32>
1254                 (activationDescriptor, Compute::GpuAcc)), "DepthwiseConvolution + Activation function " << i);
1255             CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::Float32>, DataType::Float32>
1256                 (activationDescriptor, Compute::GpuAcc)), "FullyConnected + Activation function " << i);
1257             CHECK_MESSAGE((FuseActivationSimpleTest<BatchNormTest<DataType::Float32>, DataType::Float32>
1258                 (activationDescriptor, Compute::GpuAcc)), "BatchNorm + Activation function " << i);
1259             CHECK_MESSAGE((FuseActivationSimpleTest<MultiplicationTest<DataType::Float32>, DataType::Float32>
1260                 (activationDescriptor, Compute::GpuAcc)), "Multiplication + Activation function " << i);
1261             CHECK_MESSAGE((FuseActivationSimpleTest<AdditionTest<DataType::Float32>, DataType::Float32>
1262                 (activationDescriptor, Compute::GpuAcc)), "Addition + Activation function " << i);
1263             CHECK_MESSAGE((FuseActivationSimpleTest<SubtractionTest<DataType::Float32>, DataType::Float32>
1264                 (activationDescriptor, Compute::GpuAcc)), "Subtraction + Activation function " << i);
1265             CHECK_MESSAGE((FuseActivationSimpleTest<DivisionTest<DataType::Float32>, DataType::Float32>
1266                 (activationDescriptor, Compute::GpuAcc)), "Division + Activation function " << i);
1267         }
1268     }
1269 }
1270 TEST_CASE("LayerFollowedByActivationFloat16GpuAccTest")
1271 {
1272     ActivationDescriptor activationDescriptor;
1273     for (int i = 0; i != 12; ++i)
1274     {
1275         activationDescriptor.m_Function = static_cast<ActivationFunction>(i);
1276         activationDescriptor.m_A = 1.0f;
1277         activationDescriptor.m_B = -1.0f;
1278         if (activationDescriptor.m_Function != ActivationFunction::Elu)
1279         {
1280             CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::Float16>, DataType::Float16>
1281                 (activationDescriptor, Compute::GpuAcc)), "Convolution + Activation function " << i);
1282             CHECK_MESSAGE((FuseActivationSimpleTest<DWConvolution2dTest<DataType::Float16>, DataType::Float16>
1283                 (activationDescriptor, Compute::GpuAcc)), "Depthwise + Activation function " << i);
1284             CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::Float16>, DataType::Float16>
1285                 (activationDescriptor, Compute::GpuAcc)), "FullyConnected + Activation function " << i);
1286             CHECK_MESSAGE((FuseActivationSimpleTest<BatchNormTest<DataType::Float16>, DataType::Float16>
1287                 (activationDescriptor, Compute::GpuAcc)), "BatchNorm + Activation function " << i);
1288             CHECK_MESSAGE((FuseActivationSimpleTest<MultiplicationTest<DataType::Float16>, DataType::Float16>
1289                 (activationDescriptor, Compute::GpuAcc)), "Multiplication + Activation function " << i);
1290             CHECK_MESSAGE((FuseActivationSimpleTest<AdditionTest<DataType::Float16>, DataType::Float16>
1291                 (activationDescriptor, Compute::GpuAcc)), "Addition + Activation function " << i);
1292             CHECK_MESSAGE((FuseActivationSimpleTest<SubtractionTest<DataType::Float16>, DataType::Float16>
1293                 (activationDescriptor, Compute::GpuAcc)), "Subtraction + Activation function " << i);
1294             CHECK_MESSAGE((FuseActivationSimpleTest<DivisionTest<DataType::Float16>, DataType::Float16>
1295                 (activationDescriptor, Compute::GpuAcc)), "Division + Activation function " << i);
1296         }
1297     }
1298 }
1299 TEST_CASE("LayerFollowedByActivationQAsymmU8GpuAccTest")
1300 {
1301     ActivationDescriptor activationDescriptor;
1302 
1303     activationDescriptor.m_Function = ActivationFunction::Sigmoid;
1304     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1305         (activationDescriptor, Compute::GpuAcc, 1.f / 256.f, 0)), "Convolution + Activation function " <<
1306         static_cast<int>(activationDescriptor.m_Function));
1307     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1308         (activationDescriptor, Compute::GpuAcc, 1.f / 256.f, 0)), "FullyConnected + Activation function " <<
1309         static_cast<int>(activationDescriptor.m_Function));
1310 
1311     activationDescriptor.m_Function = ActivationFunction::TanH;
1312     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1313         (activationDescriptor, Compute::GpuAcc, 1.f / 128.f, 128)), "Convolution + Activation function " <<
1314         static_cast<int>(activationDescriptor.m_Function));
1315     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1316         (activationDescriptor, Compute::GpuAcc, 1.f / 128.f, 128)), "FullyConnected + Activation function " <<
1317         static_cast<int>(activationDescriptor.m_Function));
1318 
1319     activationDescriptor.m_Function = ActivationFunction::ReLu;
1320     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1321         (activationDescriptor, Compute::GpuAcc)), "Convolution + Activation function " <<
1322         static_cast<int>(activationDescriptor.m_Function));
1323     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1324         (activationDescriptor, Compute::GpuAcc)), "FullyConnected + Activation function " <<
1325         static_cast<int>(activationDescriptor.m_Function));
1326 
1327     activationDescriptor.m_Function = ActivationFunction::BoundedReLu;
1328     activationDescriptor.m_A = 1.0f;
1329     activationDescriptor.m_B = -1.0f;
1330     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1331         (activationDescriptor, Compute::GpuAcc)), "Convolution + Activation function " <<
1332         static_cast<int>(activationDescriptor.m_Function));
1333     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1334         (activationDescriptor, Compute::GpuAcc)), "FullyConnected + Activation function " <<
1335         static_cast<int>(activationDescriptor.m_Function));
1336 
1337     activationDescriptor.m_Function = ActivationFunction::HardSwish;
1338     CHECK_MESSAGE((FuseActivationSimpleTest<Convolution2dTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1339         (activationDescriptor, Compute::GpuAcc)), "Convolution + Activation function " <<
1340         static_cast<int>(activationDescriptor.m_Function));
1341     CHECK_MESSAGE((FuseActivationSimpleTest<FullyConnectedTest<DataType::QAsymmU8>, DataType::QAsymmU8>
1342         (activationDescriptor, Compute::GpuAcc)), "FullyConnected + Activation function " <<
1343         static_cast<int>(activationDescriptor.m_Function));
1344 }
1345 }
1346 #endif
1347