diff options
Diffstat (limited to 'src/armnn/test')
23 files changed, 3706 insertions, 826 deletions
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp index c3f4b8a1bf..ee0c584b13 100644 --- a/src/armnn/test/CreateWorkload.hpp +++ b/src/armnn/test/CreateWorkload.hpp @@ -22,7 +22,7 @@ namespace using namespace std; -// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type +// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type. template<typename Workload> std::unique_ptr<Workload> MakeAndCheckWorkload(Layer& layer, Graph& graph, const IWorkloadFactory& factory) { @@ -30,18 +30,19 @@ std::unique_ptr<Workload> MakeAndCheckWorkload(Layer& layer, Graph& graph, const BOOST_TEST(workload.get() == boost::polymorphic_downcast<Workload*>(workload.get()), "Cannot convert to derived class"); std::string reasonIfUnsupported; + layer.SetComputeDevice(factory.GetCompute()); BOOST_TEST(factory.IsLayerSupported(layer, layer.GetDataType(), reasonIfUnsupported)); return std::unique_ptr<Workload>(static_cast<Workload*>(workload.release())); } -// connects two layers +// Connects two layers. void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) { from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); } -// helper function to create tensor handlers for workloads, assuming they all use the same factory +// Helper function to create tensor handlers for workloads, assuming they all use the same factory. void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) { for (auto&& layer : graph.TopologicalSort()) @@ -57,11 +58,11 @@ void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) // They return the created workloads so that backend-specific checks can be performed. 
///////////////////////////////////////////////////////////////////////////////////////////// -template <typename ActivationWorkload> +template <typename ActivationWorkload, armnn::DataType DataType> std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. ActivationDescriptor layerDesc; layerDesc.m_Function = ActivationFunction::Abs; layerDesc.m_A = 3.5f; @@ -69,19 +70,19 @@ std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloa ActivationLayer* const layer = graph.AddLayer<ActivationLayer>(layerDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({1, 1}, ActivationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ActivationWorkload>(*layer, graph, factory); ActivationQueueDescriptor queueDescriptor = workload->GetData(); @@ -91,51 +92,51 @@ std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloa BOOST_TEST(queueDescriptor.m_Parameters.m_B == -10.0f); BOOST_TEST((queueDescriptor.m_Parameters.m_Function == ActivationFunction::Abs)); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. 
return workload; } -template <typename AdditionWorkload> +template <typename AdditionWorkload, armnn::DataType DataType> std::unique_ptr<AdditionWorkload> CreateAdditionWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<AdditionLayer>("layer"); - // create extra layers + // Creates extra layers. Layer* const input1 = graph.AddLayer<InputLayer>(1, "input1"); Layer* const input2 = graph.AddLayer<InputLayer>(2, "input2"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, AdditionWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<AdditionWorkload>(*layer, graph, factory); AdditionQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename BatchNormalizationFloat32Workload> +template <typename BatchNormalizationFloat32Workload, armnn::DataType DataType> std::unique_ptr<BatchNormalizationFloat32Workload> CreateBatchNormalizationWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. 
BatchNormalizationDescriptor layerDesc; layerDesc.m_Eps = 0.05f; BatchNormalizationLayer* const layer = graph.AddLayer<BatchNormalizationLayer>(layerDesc, "layer"); - armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32); + armnn::TensorInfo weightInfo({3}, DataType); layer->m_Mean = std::make_unique<ScopedCpuTensorHandle>(weightInfo); layer->m_Variance = std::make_unique<ScopedCpuTensorHandle>(weightInfo); layer->m_Beta = std::make_unique<ScopedCpuTensorHandle>(weightInfo); @@ -145,37 +146,37 @@ std::unique_ptr<BatchNormalizationFloat32Workload> CreateBatchNormalizationWorkl layer->m_Beta->Allocate(); layer->m_Gamma->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3, 1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<BatchNormalizationFloat32Workload>(*layer, graph, factory); BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Parameters.m_Eps == 0.05f); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); + BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Convolution2dWorkload> +template <typename Convolution2dWorkload, armnn::DataType DataType> std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. 
Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -187,24 +188,22 @@ std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IW Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer"); - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType)); - layer->m_Bias = std::make_unique<ScopedCpuTensorHandle> - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType))); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3}, DataType)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2}, GetBiasDataType(DataType))); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 8, 16}, Convolution2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({2, 2, 2, 10}, Convolution2dWorkload::ms_DataType)); + // Connects up. + Connect(input, layer, TensorInfo({2, 3, 8, 16}, DataType)); + Connect(layer, output, TensorInfo({2, 2, 2, 10}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<Convolution2dWorkload>(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -218,20 +217,123 @@ std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IW BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, DataType))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType)))); + TensorInfo({2}, GetBiasDataType(DataType)))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Convolution2dWorkload> +template <typename LstmWorkload> +std::unique_ptr<LstmWorkload> CreateLstmWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // This parameter setting is for withCifgWithPeepholeNoProjection + LstmDescriptor layerDesc; + layerDesc.m_ActivationFunc = 4; + layerDesc.m_ClippingThresCell = 0.0f; + layerDesc.m_ClippingThresProj = 0.0f; + layerDesc.m_CifgEnabled = true; + layerDesc.m_PeepholeEnabled = true; + layerDesc.m_ProjectionEnabled = false; + + LstmLayer* const layer = graph.AddLayer<LstmLayer>(layerDesc, "layer"); + unsigned int batchSize = 2; + unsigned int inputSize = 2; + unsigned int numUnits = 4; + unsigned int outputSize = 4; + + layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + 
(TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_CellBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_OutputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + + layer->m_BasicParameters.m_InputToForgetWeights->Allocate(); + layer->m_BasicParameters.m_InputToCellWeights->Allocate(); + layer->m_BasicParameters.m_InputToOutputWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToForgetWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToCellWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToOutputWeights->Allocate(); + layer->m_BasicParameters.m_ForgetGateBias->Allocate(); + layer->m_BasicParameters.m_CellBias->Allocate(); + layer->m_BasicParameters.m_OutputGateBias->Allocate(); + + + if (layerDesc.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToForgetWeights->Allocate(); + layer->m_PeepholeParameters.m_CellToOutputWeights->Allocate(); + } + + // create input and output layers + Layer* const 
input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const outputStateIn = graph.AddLayer<InputLayer>(1, "outputStateIn"); + Layer* const cellStateIn = graph.AddLayer<InputLayer>(2, "cellStateIn"); + Layer* const scratchBuffer = graph.AddLayer<OutputLayer>(0, "scratchBuffer"); + Layer* const outputStateOut = graph.AddLayer<OutputLayer>(1, "outputStateOut"); + Layer* const cellStateOut = graph.AddLayer<OutputLayer>(2, "cellStateOut"); + Layer* const output = graph.AddLayer<OutputLayer>(3, "output"); + + // connect up + armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32); + armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32); + if (layerDesc.m_CifgEnabled) + { + lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 }); + } + + Connect(input, layer, lstmTensorInfo1, 0, 0); + Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1); + Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2); + Connect(layer, scratchBuffer, lstmTensorInfoScratchBuff, 0, 0); + Connect(layer, outputStateOut, lstmTensorInfo3, 1, 0); + Connect(layer, cellStateOut, lstmTensorInfo2, 2, 0); + Connect(layer, output, lstmTensorInfo3, 3, 0); + + CreateTensorHandles(graph, factory); + + // make the workload and check it + auto workload = MakeAndCheckWorkload<LstmWorkload>(*layer, graph, factory); + LstmQueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Parameters.m_ActivationFunc == 4); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresCell == 0.0f); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresProj == 0.0f); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 3); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 4); + + BOOST_TEST((queueDescriptor.m_InputToForgetWeights->GetTensorInfo() == TensorInfo({ numUnits, inputSize }, 
+ DataType::Float32))); + BOOST_TEST((queueDescriptor.m_OutputGateBias->GetTensorInfo() == TensorInfo({ numUnits }, + DataType::Float32))); + BOOST_TEST((queueDescriptor.m_CellBias->GetTensorInfo() == TensorInfo({ numUnits }, DataType::Float32))); + return workload; +} + +template <typename Convolution2dWorkload, armnn::DataType DataType> std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 1; layerDesc.m_PadRight = 1; @@ -243,26 +345,25 @@ std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(arm Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer"); - float inputsQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({ 2, 3, 3, 3 }, - Convolution2dWorkload::ms_DataType, inputsQScale)); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({ 2, 3, 3, 3 }, DataType, inputsQScale)); layer->m_Bias = std::make_unique<ScopedCpuTensorHandle> - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale)); + (TensorInfo({2}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. 
Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 6, 6}, Convolution2dWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({2, 2, 6, 6}, Convolution2dWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({2, 3, 6, 6}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({2, 2, 6, 6}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<Convolution2dWorkload>(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -277,11 +378,11 @@ std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(arm BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 3, 3}, - Convolution2dWorkload::ms_DataType, inputsQScale))); + DataType, inputsQScale))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() - == TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale))); + == TensorInfo({2}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } @@ -289,7 +390,7 @@ template <typename DepthwiseConvolution2dFloat32Workload> std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolution2dWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. 
DepthwiseConvolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -306,16 +407,16 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up + // Connects up. Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<DepthwiseConvolution2dFloat32Workload>(*layer, graph, factory); DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -332,41 +433,39 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({3, 3, 5, 3}, DataType::Float32))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({9}, DataType::Float32))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename FullyConnectedWorkload> +template <typename FullyConnectedWorkload, armnn::DataType DataType> std::unique_ptr<FullyConnectedWorkload> CreateFullyConnectedWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. FullyConnectedDescriptor layerDesc; layerDesc.m_BiasEnabled = true; layerDesc.m_TransposeWeightMatrix = true; FullyConnectedLayer* const layer = graph.AddLayer<FullyConnectedLayer>(layerDesc, "layer"); - float inputsQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 
1.0f : 0.0; - float outputQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20}, - FullyConnectedWorkload::ms_DataType, inputsQScale, 0)); - layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7}, - GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale)); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20}, DataType, inputsQScale, 0)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 1, 4, 5}, FullyConnectedWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({3, 7}, FullyConnectedWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({3, 7}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<FullyConnectedWorkload>(*layer, graph, factory); FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); @@ -375,50 +474,48 @@ std::unique_ptr<FullyConnectedWorkload> CreateFullyConnectedWorkloadTest(armnn:: BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == - TensorInfo({7, 20}, FullyConnectedWorkload::ms_DataType, inputsQScale))); - BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({7}, GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({7, 20}, DataType, inputsQScale))); + BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({7}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename MultiplicationWorkload> +template <typename MultiplicationWorkload, armnn::DataType DataType> std::unique_ptr<MultiplicationWorkload> CreateMultiplicationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<MultiplicationLayer>("layer"); - // create extra layers + // Creates extra layers. Layer* const input1 = graph.AddLayer<InputLayer>(1, "input1"); Layer* const input2 = graph.AddLayer<InputLayer>(2, "input2"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, MultiplicationWorkload::ms_DataType); + // Connects up. 
+ armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<MultiplicationWorkload>(*layer, graph, factory); MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename NormalizationFloat32Workload> +template <typename NormalizationFloat32Workload, armnn::DataType DataType> std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. NormalizationDescriptor layerDesc; layerDesc.m_NormChannelType = NormalizationAlgorithmChannel::Across; layerDesc.m_NormMethodType = NormalizationAlgorithmMethod::LocalBrightness; @@ -429,16 +526,16 @@ std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(ar NormalizationLayer* layer = graph.AddLayer<NormalizationLayer>(layerDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); - Connect(layer, output, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); + // Connects up. + Connect(input, layer, TensorInfo({3, 5, 5, 1}, DataType)); + Connect(layer, output, TensorInfo({3, 5, 5, 1}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<NormalizationFloat32Workload>(*layer, graph, factory); NormalizationQueueDescriptor queueDescriptor = workload->GetData(); @@ -452,15 +549,15 @@ std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(ar BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Pooling2dWorkload> +template <typename Pooling2dWorkload, armnn::DataType DataType> std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Pooling2dDescriptor layerDesc; layerDesc.m_PoolType = PoolingAlgorithm::Average; layerDesc.m_PoolWidth = 3; @@ -475,16 +572,16 @@ std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadF Pooling2dLayer* const layer = graph.AddLayer<Pooling2dLayer>(layerDesc, "layer"); - // create extra layers + // Create extra layers Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 2, 5, 5}, Pooling2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({3, 2, 2, 4}, Pooling2dWorkload::ms_DataType)); + // Connect up + Connect(input, layer, TensorInfo({3, 2, 5, 5}, DataType)); + Connect(layer, output, TensorInfo({3, 2, 2, 4}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it auto workload = MakeAndCheckWorkload<Pooling2dWorkload>(*layer, graph, factory); Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -502,70 +599,70 @@ std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadF BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); 
BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests return workload; } -template <typename SoftmaxWorkload> +template <typename SoftmaxWorkload, armnn::DataType DataType> std::unique_ptr<SoftmaxWorkload> CreateSoftmaxWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. SoftmaxDescriptor softmaxDescriptor; Layer* const layer = graph.AddLayer<SoftmaxLayer>(softmaxDescriptor, "layer"); - // create extra layers + // Create extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({4, 1}, SoftmaxWorkload::ms_DataType); + // Connect up + armnn::TensorInfo tensorInfo({4, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it. auto workload = MakeAndCheckWorkload<SoftmaxWorkload>(*layer, graph, factory); SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests. return workload; } -template<typename SplitterWorkload> +template<typename SplitterWorkload, armnn::DataType DataType> std::unique_ptr<SplitterWorkload> CreateSplitterWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. 
// NOTE: need three dimensions channels, height/y, width/x because the Compute // library restricts subtensors to have the same x and y dimensions as // their parent tensors, and therefore the origin on the x and y dimension // has to be zero for any view. So we need a third dimension to split... - // NOTE: arguments are: number of views, number of dimensions + // NOTE: arguments are: number of views, number of dimensions. ViewsDescriptor layerDesc(3, 3); - // NOTE: arguments are: view, dimension, value + // NOTE: arguments are: view, dimension, value. layerDesc.SetViewOriginCoord(0, 0, 0); layerDesc.SetViewOriginCoord(1, 0, 1); layerDesc.SetViewOriginCoord(2, 0, 3); Layer* const layer = graph.AddLayer<SplitterLayer>(layerDesc, "layer"); - // add extra layers + // Adds extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output0 = graph.AddLayer<OutputLayer>(0, "output0"); Layer* const output1 = graph.AddLayer<OutputLayer>(1, "output1"); Layer* const output2 = graph.AddLayer<OutputLayer>(2, "output2"); - // connect up - armnn::TensorInfo tensorInfo({5, 7, 7}, SplitterWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({5, 7, 7}, DataType); Connect(input, layer, tensorInfo); - armnn::TensorInfo output0Info({1, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output1Info({2, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output2Info({2, 7, 7}, SplitterWorkload::ms_DataType); + armnn::TensorInfo output0Info({1, 7, 7}, DataType); + armnn::TensorInfo output1Info({2, 7, 7}, DataType); + armnn::TensorInfo output2Info({2, 7, 7}, DataType); Connect(layer, output0, output0Info, 0, 0); Connect(layer, output1, output1Info, 1, 0); @@ -573,7 +670,7 @@ std::unique_ptr<SplitterWorkload> CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<SplitterWorkload>(*layer, graph, factory); SplitterQueueDescriptor queueDescriptor = workload->GetData(); @@ -591,24 +688,21 @@ std::unique_ptr<SplitterWorkload> BOOST_TEST(queueDescriptor.m_ViewOrigins[1].m_Origin[2] == 0); BOOST_TEST(queueDescriptor.m_ViewOrigins[2].m_Origin[2] == 0); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads -template<typename SplitterWorkload, typename MergerWorkload> +/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads. +template<typename SplitterWorkload, typename MergerWorkload, armnn::DataType DataType> std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> CreateSplitterMergerWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - static_assert(SplitterWorkload::ms_DataType == MergerWorkload::ms_DataType, - "Splitter and merger workloads must have the same data type"); + armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, DataType); - armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - - //construct the graph + //Constructs the graph. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -641,12 +735,12 @@ std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // add connections + // Adds connections. 
Connect(input, splitter, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect input to splitter"); - Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up + Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up. BOOST_TEST_CHECKPOINT("connect splitter[0] to merger[1]"); - Connect(splitter, merger, splitTensorInfo2, 1, 0); // so that the outputs are flipped round + Connect(splitter, merger, splitTensorInfo2, 1, 0); // So that the outputs are flipped round. BOOST_TEST_CHECKPOINT("connect splitter[1] to merger[0]"); Connect(merger, output, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect merger to output"); @@ -665,7 +759,7 @@ std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> /// This function constructs a graph with a splitter with two outputs. Each of the outputs is then /// connected to two different activation layers -template<typename SplitterWorkload, typename ActivationWorkload> +template<typename SplitterWorkload, typename ActivationWorkload, armnn::DataType DataType> void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph, std::unique_ptr<SplitterWorkload>& wlSplitter, std::unique_ptr<ActivationWorkload>& wlActiv0_0, @@ -673,14 +767,11 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& std::unique_ptr<ActivationWorkload>& wlActiv1_0, std::unique_ptr<ActivationWorkload>& wlActiv1_1) { - static_assert(SplitterWorkload::ms_DataType == ActivationWorkload::ms_DataType, - "Splitter and activation workloads must have the same data type"); - - armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, DataType); + 
armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, DataType); - //construct the graph + //Constructs the graph. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -709,7 +800,7 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& Layer* const output3 = graph.AddLayer<OutputLayer>(3, "output3"); Layer* const output4 = graph.AddLayer<OutputLayer>(4, "output4"); - // add connections + // Adds connections. Connect(input, splitter, inputTensorInfo, 0, 0); Connect(splitter, activ0_0, splitTensorInfo1, 0, 0); Connect(splitter, activ0_1, splitTensorInfo1, 0, 0); @@ -737,97 +828,155 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& wlActiv1_1 = std::move(workloadActiv1_1); } -template <typename ResizeBilinearWorkload> +template <typename ResizeBilinearWorkload, armnn::DataType DataType> std::unique_ptr<ResizeBilinearWorkload> CreateResizeBilinearWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 2, 3, 2, 2 }); ResizeBilinearDescriptor resizeDesc; resizeDesc.m_TargetWidth = outputShape[3]; resizeDesc.m_TargetHeight = outputShape[2]; Layer* const layer = graph.AddLayer<ResizeBilinearLayer>(resizeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ResizeBilinearWorkload::ms_DataType); + // Connects up. 
+ armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ResizeBilinearWorkload>(*layer, graph, factory); ResizeBilinearQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename L2NormalizationWorkload> +template <typename L2NormalizationWorkload, armnn::DataType DataType> std::unique_ptr<L2NormalizationWorkload> CreateL2NormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<L2NormalizationLayer>("l2norm"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, DataType); + armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<L2NormalizationWorkload>(*layer, graph, factory); L2NormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename ReshapeWorkload> +template <typename ReshapeWorkload, armnn::DataType DataType> std::unique_ptr<ReshapeWorkload> CreateReshapeWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 1, 4 }); ReshapeDescriptor reshapeDesc; reshapeDesc.m_TargetShape = outputShape; Layer* const layer = graph.AddLayer<ReshapeLayer>(reshapeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 4, 1 }, ReshapeWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ReshapeWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 4, 1 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ReshapeWorkload>(*layer, graph, factory); ReshapeQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. 
+ return workload; +} + +template <typename ConvertFp16ToFp32Float32Workload> +std::unique_ptr<ConvertFp16ToFp32Float32Workload> CreateConvertFp16ToFp32WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp16ToFp32Layer* const layer = graph.AddLayer<ConvertFp16ToFp32Layer>("Fp16ToFp32Converter"); + + // Creates extra layers. + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // Connects up. + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload<ConvertFp16ToFp32Float32Workload>(*layer, graph, factory); + + ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. + return workload; +} + +template <typename ConvertFp32ToFp16Float16Workload> +std::unique_ptr<ConvertFp32ToFp16Float16Workload> CreateConvertFp32ToFp16WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp32ToFp16Layer* const layer = graph.AddLayer<ConvertFp32ToFp16Layer>("Fp32ToFp16Converter"); + + // Creates extra layers. + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // Connects up. 
+ armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload<ConvertFp32ToFp16Float16Workload>(*layer, graph, factory); + + ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. return workload; } diff --git a/src/armnn/test/CreateWorkloadClNeon.hpp b/src/armnn/test/CreateWorkloadClNeon.hpp index a41a70755f..d92111ac41 100644 --- a/src/armnn/test/CreateWorkloadClNeon.hpp +++ b/src/armnn/test/CreateWorkloadClNeon.hpp @@ -56,22 +56,21 @@ boost::test_tools::predicate_result CompareTensorHandleShape(IComputeTensorHandl return true; } -template<template <DataType> class CopyFromCpuWorkload, template <DataType> class CopyToCpuWorkload, - typename IComputeTensorHandle> +template<typename IComputeTensorHandle> void CreateMemCopyWorkloads(IWorkloadFactory& factory) { Graph graph; RefWorkloadFactory refFactory; - // create the layers we're testing + // Creates the layers we're testing. Layer* const layer1 = graph.AddLayer<MemCopyLayer>("layer1"); Layer* const layer2 = graph.AddLayer<MemCopyLayer>("layer2"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up + // Connects up. 
TensorInfo tensorInfo({2, 3}, DataType::Float32); Connect(input, layer1, tensorInfo); Connect(layer1, layer2, tensorInfo); @@ -83,8 +82,8 @@ void CreateMemCopyWorkloads(IWorkloadFactory& factory) output->CreateTensorHandles(graph, refFactory); // make the workloads and check them - auto workload1 = MakeAndCheckWorkload<CopyFromCpuWorkload<DataType::Float32>>(*layer1, graph, factory); - auto workload2 = MakeAndCheckWorkload<CopyToCpuWorkload<DataType::Float32>>(*layer2, graph, refFactory); + auto workload1 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer1, graph, factory); + auto workload2 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer2, graph, refFactory); MemCopyQueueDescriptor queueDescriptor1 = workload1->GetData(); BOOST_TEST(queueDescriptor1.m_Inputs.size() == 1); @@ -104,4 +103,4 @@ void CreateMemCopyWorkloads(IWorkloadFactory& factory) BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32))); } -}
\ No newline at end of file +} //namespace
\ No newline at end of file diff --git a/src/armnn/test/CsvReaderTest.cpp b/src/armnn/test/CsvReaderTest.cpp new file mode 100644 index 0000000000..8df61e1fdd --- /dev/null +++ b/src/armnn/test/CsvReaderTest.cpp @@ -0,0 +1,124 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "CsvReader.hpp" + +#include <boost/algorithm/string.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <string> +#include <boost/filesystem.hpp> + +using namespace armnnUtils; + +struct TestHelper { + + TestHelper() + { + BOOST_TEST_MESSAGE("setup fixture"); + } + + ~TestHelper() + { + BOOST_TEST_MESSAGE("teardown fixture"); + TearDown(); + } + + std::string CreateTempCsvFile() + { + std::string fileDir = boost::filesystem::temp_directory_path().c_str(); + boost::filesystem::path p{fileDir + "/sampleFile.csv"}; + try + { + boost::filesystem::ofstream ofs{p}; + ofs << "airplane, bicycle , bird , \"m,o,n,k,e,y\"\n"; + ofs << "banana, shoe, \"ice\""; + ofs.close(); + } catch (std::exception &e) + { + std::cerr << "Unable to write to file at location [" << p.c_str() << "] : " << e.what() << std::endl; + BOOST_TEST(false); + } + return fileDir + "/sampleFile.csv"; + } + + int CheckStringsMatch(CsvRow &row, unsigned int index, std::string expectedValue) + { + return row.values.at(index).compare(expectedValue); + } + + void TearDown() + { + RemoveCsvFile(); + } + + void RemoveCsvFile() + { + std::string fileDir = boost::filesystem::temp_directory_path().c_str(); + std::string filePath = fileDir + "/sampleFile.csv"; + try + { + boost::filesystem::remove(filePath); + } + catch (std::exception &e) + { + std::cerr << "Unable to delete file [" << filePath << "] : " << e.what() << std::endl; + BOOST_TEST(false); + } + } +}; + +BOOST_AUTO_TEST_SUITE(CsvReaderTest) + +BOOST_FIXTURE_TEST_CASE(TestParseVector, TestHelper) +{ + CsvReader reader; + std::vector<std::string> csvStrings; + 
csvStrings.reserve(2); + csvStrings.push_back("airplane, automobile , bird , \"c,a,t\""); + csvStrings.push_back("banana, shoe, \"ice\""); + + std::vector<CsvRow> row = reader.ParseVector(csvStrings); + CsvRow row1 = row[0]; + CsvRow row2 = row[1]; + + BOOST_CHECK(row.size() == 2); + + BOOST_CHECK(row1.values.size() == 4); + BOOST_CHECK(CheckStringsMatch(row1, 0, "airplane") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 1, "automobile") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 2, "bird") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 3, "c,a,t") == 0); + + BOOST_CHECK(row2.values.size() == 3); + BOOST_CHECK(CheckStringsMatch(row2, 0, "banana") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 1, "shoe") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 2, "ice") == 0); +} + +BOOST_FIXTURE_TEST_CASE(TestLoadingFileFromDisk, TestHelper) +{ + CsvReader reader; + std::string theFilePath = TestHelper::CreateTempCsvFile(); + + std::vector<CsvRow> row = reader.ParseFile(theFilePath); + CsvRow row1 = row[0]; + CsvRow row2 = row[1]; + + BOOST_CHECK(row.size() == 2); + + BOOST_CHECK(row1.values.size() == 4); + BOOST_CHECK(CheckStringsMatch(row1, 0, "airplane") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 1, "bicycle") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 2, "bird") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 3, "m,o,n,k,e,y") == 0); + + BOOST_CHECK(row2.values.size() == 3); + BOOST_CHECK(CheckStringsMatch(row2, 0, "banana") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 1, "shoe") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 2, "ice") == 0); +} + +BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file diff --git a/src/armnn/test/EndToEndTest.cpp b/src/armnn/test/EndToEndTest.cpp index 5ed84d22d0..4a8a0dfd81 100644 --- a/src/armnn/test/EndToEndTest.cpp +++ b/src/armnn/test/EndToEndTest.cpp @@ -11,6 +11,8 @@ #include "backends/test/QuantizeHelper.hpp" #include <boost/core/ignore_unused.hpp> +#include <set> + BOOST_AUTO_TEST_SUITE(EndToEnd) namespace @@ -47,9 +49,10 @@ BOOST_AUTO_TEST_CASE(Unsigned8) using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. armnn::INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0, "input"); @@ -59,7 +62,7 @@ BOOST_AUTO_TEST_CASE(Unsigned8) input->GetOutputSlot(0).Connect(softmax->GetInputSlot(0)); softmax->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo inputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); inputTensorInfo.SetQuantizationOffset(100); inputTensorInfo.SetQuantizationScale(10000.0f); @@ -71,17 +74,18 @@ BOOST_AUTO_TEST_CASE(Unsigned8) softmax->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; auto error = runtime->LoadNetwork(netId, std::move(optNet)); BOOST_TEST(error == Status::Success); - // create structures for input & output + // Creates structures for input & output. 
std::vector<uint8_t> inputData { - 1, 10, 3, 200, 5 // some inputs - one of which is sufficiently larger than the others to saturate softmax + 1, 10, 3, 200, 5 // Some inputs - one of which is sufficiently larger than the others to saturate softmax. }; std::vector<uint8_t> outputData(5); @@ -94,19 +98,19 @@ BOOST_AUTO_TEST_CASE(Unsigned8) {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(outputData[0] == 0); BOOST_TEST(outputData[1] == 0); BOOST_TEST(outputData[2] == 0); - BOOST_TEST(outputData[3] == 255); // softmax has been saturated + BOOST_TEST(outputData[3] == 255); // softmax has been saturated. BOOST_TEST(outputData[4] == 0); } template <typename T> -void ConstantUsageTest(armnn::Compute computeDevice, +void ConstantUsageTest(const std::vector<armnn::Compute>& computeDevice, const armnn::TensorInfo& commonTensorInfo, const std::vector<T>& inputData, const std::vector<T>& constantData, @@ -115,9 +119,10 @@ void ConstantUsageTest(armnn::Compute computeDevice, using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(computeDevice)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); @@ -129,19 +134,19 @@ void ConstantUsageTest(armnn::Compute computeDevice, constant->GetOutputSlot(0).Connect(add->GetInputSlot(1)); add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. 
input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output + // Creates structures for input & output. std::vector<T> outputData(inputData.size()); InputTensors inputTensors @@ -153,26 +158,26 @@ void ConstantUsageTest(armnn::Compute computeDevice, {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(outputData == expectedOutputData); } -static void ConstantUsageFloat32Test(armnn::Compute computeDevice) +static void ConstantUsageFloat32Test(const std::vector<armnn::Compute>& computeDevice) { const armnn::TensorInfo commonTensorInfo({ 2, 3 }, armnn::DataType::Float32); ConstantUsageTest(computeDevice, commonTensorInfo, - std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // input - std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // const input - std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // expected output + std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input. + std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input. + std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // Expected output. 
); } -static void ConstantUsageUint8Test(armnn::Compute computeDevice) +static void ConstantUsageUint8Test(const std::vector<armnn::Compute>& computeDevice) { armnn::TensorInfo commonTensorInfo({ 2, 3 }, armnn::DataType::QuantisedAsymm8); @@ -184,46 +189,49 @@ static void ConstantUsageUint8Test(armnn::Compute computeDevice) ConstantUsageTest(computeDevice, commonTensorInfo, - QuantizedVector<uint8_t>(scale, offset, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }), // input - QuantizedVector<uint8_t>(scale, offset, { 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }), // const input - QuantizedVector<uint8_t>(scale, offset, { 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }) // expected output + QuantizedVector<uint8_t>(scale, offset, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }), // Input. + QuantizedVector<uint8_t>(scale, offset, { 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }), // Const input. + QuantizedVector<uint8_t>(scale, offset, { 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }) // Expected output. ); } BOOST_AUTO_TEST_CASE(ConstantUsage_Ref_Float32) { - ConstantUsageFloat32Test(armnn::Compute::CpuRef); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + ConstantUsageFloat32Test(backends); } #if ARMCOMPUTENEON_ENABLED BOOST_AUTO_TEST_CASE(ConstantUsage_Neon_Float32) { - ConstantUsageFloat32Test(armnn::Compute::CpuAcc); + ConstantUsageFloat32Test({armnn::Compute::CpuAcc}); } #endif #if ARMCOMPUTECL_ENABLED BOOST_AUTO_TEST_CASE(ConstantUsage_Cl_Float32) { - ConstantUsageFloat32Test(armnn::Compute::GpuAcc); + ConstantUsageFloat32Test({armnn::Compute::GpuAcc}); } #endif BOOST_AUTO_TEST_CASE(ConstantUsage_Ref_Uint8) { - ConstantUsageUint8Test(armnn::Compute::CpuRef); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + ConstantUsageUint8Test(backends); } BOOST_AUTO_TEST_CASE(TrivialAdd) { - // This test was designed to match "AddTwo" in android nn/runtime/test/TestTrivialModel.cpp + // This test was designed to match "AddTwo" in android nn/runtime/test/TestTrivialModel.cpp. 
using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. armnn::INetworkPtr net(INetwork::Create()); IConnectableLayer* input1 = net->AddInputLayer(0); @@ -235,20 +243,21 @@ BOOST_AUTO_TEST_CASE(TrivialAdd) input2->GetOutputSlot(0).Connect(add->GetInputSlot(1)); add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo tensorInfo(TensorShape({3, 4}), DataType::Float32); input1->GetOutputSlot(0).SetTensorInfo(tensorInfo); input2->GetOutputSlot(0).SetTensorInfo(tensorInfo); add->GetOutputSlot(0).SetTensorInfo(tensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output - matching android nn test + // Creates structures for input & output - matching android nn test. std::vector<float> input1Data { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f @@ -269,10 +278,10 @@ BOOST_AUTO_TEST_CASE(TrivialAdd) {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. 
runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results BOOST_TEST(outputData[0] == 101); BOOST_TEST(outputData[1] == 202); BOOST_TEST(outputData[2] == 303); @@ -292,9 +301,10 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); @@ -331,7 +341,7 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) activation2->GetOutputSlot(0).Connect(output2->GetInputSlot(0)); activation3->GetOutputSlot(0).Connect(output3->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo tensorInfo(TensorShape({ 10 }), DataType::Float32); input->GetOutputSlot(0).SetTensorInfo(tensorInfo); activation1->GetOutputSlot(0).SetTensorInfo(tensorInfo); @@ -339,13 +349,14 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) activation3->GetOutputSlot(0).SetTensorInfo(tensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output + // Creates structures for input & output. 
const std::vector<float> inputData{ 3.f, 5.f, 2.f, 3.f, 7.f, 0.f, -2.f, -1.f, 3.f, 3.f }; std::vector<float> output1Data(inputData.size()); @@ -363,32 +374,66 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) {2,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 2), output3Data.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(output1Data == std::vector<float>({ 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, -1.f, -1.f, 1.f, 1.f })); // ReLu1 BOOST_TEST(output2Data == std::vector<float>({ 3.f, 5.f, 2.f, 3.f, 6.f, 0.f, 0.f, 0.f, 3.f, 3.f })); // ReLu6 BOOST_TEST(output3Data == std::vector<float>({ 3.f, 5.f, 2.f, 3.f, 5.f, 2.f, 2.f, 2.f, 3.f, 3.f })); // [2, 5] } #if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(FallbackToCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run and allow fallback to CpuRef. + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of the network. + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but we allow fallback to CpuRef so it shoud pass. + NormalizationDescriptor descriptor; + IConnectableLayer* pooling = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0)); + pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // optimize the network + std::vector<Compute> backends = {Compute::CpuAcc, Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should pass. 
+ NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} +#endif // ARMCOMPUTENEON_ENABLED + BOOST_AUTO_TEST_CASE(ErrorOnLoadNetwork) { using namespace armnn; // Create runtime in which test will run // Note we don't allow falling back to CpuRef if an operation (excluding inputs, outputs, etc.) isn't supported - armnn::IRuntime::CreationOptions options(armnn::Compute::CpuAcc); - options.m_UseCpuRefAsFallback = false; - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); // build up the structure of the network INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); - // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so LoadNetwork will fail. + // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so Optimize will return null. NormalizationDescriptor descriptor; IConnectableLayer* pooling = net->AddNormalizationLayer(descriptor); @@ -401,12 +446,9 @@ BOOST_AUTO_TEST_CASE(ErrorOnLoadNetwork) pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); - - // Load it into the runtime. It should fail. - NetworkId netId; - BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Failure); + std::vector<Compute> backends = {Compute::CpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); } -#endif // ARMCOMPUTENEON_ENABLED BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/FP16SupportTest.cpp b/src/armnn/test/FP16SupportTest.cpp new file mode 100644 index 0000000000..cc3b60369c --- /dev/null +++ b/src/armnn/test/FP16SupportTest.cpp @@ -0,0 +1,114 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. 
+// See LICENSE file in the project root for full license information. +// + +#include "armnn/ArmNN.hpp" +#include "armnn/Descriptors.hpp" +#include "Graph.hpp" +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "Optimizer.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/test/QuantizeHelper.hpp" + +#include <boost/core/ignore_unused.hpp> +#include <boost/test/unit_test.hpp> + +#include <Half.hpp> +#include <set> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Fp16Support) + +BOOST_AUTO_TEST_CASE(Fp16DataTypeSupport) +{ + Graph graph; + + Layer* const inputLayer1 = graph.AddLayer<InputLayer>(1, "input1"); + Layer* const inputLayer2 = graph.AddLayer<InputLayer>(2, "input2"); + + Layer* const additionLayer = graph.AddLayer<AdditionLayer>("addition"); + Layer* const outputLayer = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + TensorInfo fp16TensorInfo({1, 2, 3, 5}, armnn::DataType::Float16); + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + inputLayer1->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + + BOOST_CHECK(inputLayer1->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(inputLayer2->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(additionLayer->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + +} + +BOOST_AUTO_TEST_CASE(Fp16AdditionTest) +{ + using namespace half_float::literal; + // Create runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of the network. 
+ INetworkPtr net(INetwork::Create()); + + + IConnectableLayer* inputLayer1 = net->AddInputLayer(0); + IConnectableLayer* inputLayer2 = net->AddInputLayer(1); + IConnectableLayer* additionLayer = net->AddAdditionLayer(); + IConnectableLayer* outputLayer = net->AddOutputLayer(0); + + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + //change to float16 + TensorInfo fp16TensorInfo(TensorShape({4}), DataType::Float16); + inputLayer1->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + + // optimize the network + std::vector<Compute> backends = {Compute::GpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Loads it into the runtime. + + NetworkId netId; + runtime->LoadNetwork(netId, std::move(optNet)); + + std::vector<Half> input1Data + { + 1.0_h, 2.0_h, 3.0_h, 4.0_h + }; + + std::vector<Half> input2Data + { + 100.0_h, 200.0_h, 300.0_h, 400.0_h + }; + + InputTensors inputTensors + { + {0,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input1Data.data())}, + {1,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input2Data.data())} + }; + + std::vector<Half> outputData(input1Data.size()); + OutputTensors outputTensors + { + {0,Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + // Does the inference. + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // Checks the results. + BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add +} + +BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file diff --git a/src/armnn/test/FloatingPointConverterTest.cpp b/src/armnn/test/FloatingPointConverterTest.cpp new file mode 100644 index 0000000000..d936e801ef --- /dev/null +++ b/src/armnn/test/FloatingPointConverterTest.cpp @@ -0,0 +1,58 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "FloatingPointConverter.hpp" +#include "Half.hpp" + +#include <malloc.h> +#include <iostream> +#include <algorithm> + +#include <boost/test/unit_test.hpp> + +BOOST_AUTO_TEST_SUITE(TestFPConversion) + +BOOST_AUTO_TEST_CASE(TestConvertFp32ToFp16) +{ + using namespace half_float::literal; + + float floatArray[] = { 1.0f, 2.0f, 0.5f, 3.1f, 2.4f, + 5.666f, 6.444f, 7.1f, 432.121f, 12.22f }; + size_t numFloats = sizeof(floatArray) / sizeof(floatArray[0]); + std::vector<armnn::Half> convertedBuffer(numFloats, 0.0_h); + + armnnUtils::FloatingPointConverter::ConvertFloat32To16(floatArray, numFloats, convertedBuffer.data()); + + for (size_t i = 0; i < numFloats; i++) + { + armnn::Half expected(floatArray[i]); + armnn::Half actual = convertedBuffer[i]; + BOOST_CHECK_EQUAL(expected, actual); + + float convertedHalf = actual; + BOOST_CHECK_CLOSE(floatArray[i], convertedHalf, 0.07); + } +} + +BOOST_AUTO_TEST_CASE(TestConvertFp16ToFp32) +{ + using namespace half_float::literal; + + armnn::Half halfArray[] = { 1.0_h, 2.0_h, 0.5_h, 3.1_h, 2.4_h, + 5.666_h, 6.444_h, 7.1_h, 432.121_h, 12.22_h }; + size_t numFloats = sizeof(halfArray) / sizeof(halfArray[0]); + std::vector<float> convertedBuffer(numFloats, 0.0f); + + armnnUtils::FloatingPointConverter::ConvertFloat16To32(halfArray, numFloats, convertedBuffer.data()); + + for (size_t i = 0; i < numFloats; i++) + { + float expected(halfArray[i]); + float actual = convertedBuffer[i]; + BOOST_CHECK_EQUAL(expected, actual); + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/GraphTests.cpp b/src/armnn/test/GraphTests.cpp 
index 99789e4737..ccbcb8b00b 100644 --- a/src/armnn/test/GraphTests.cpp +++ b/src/armnn/test/GraphTests.cpp @@ -15,7 +15,7 @@ #include <boost/cast.hpp> -/// checks that first comes before second in the order +/// Checks that first comes before second in the order. bool CheckOrder(const armnn::Graph& graph, const armnn::Layer* first, const armnn::Layer* second) { graph.Print(); @@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE(TopologicalSort) armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); armnn::Layer* const layerD = GetFirstLayerWithName(graph, "layerD"); - // simple graph which branches and rejoins + // Simple graph which branches and rejoins. // A // / \' // D E @@ -92,7 +92,7 @@ BOOST_AUTO_TEST_CASE(TopologicalSort) BOOST_TEST(CheckOrder(graph, layerB, layerC)); } -BOOST_AUTO_TEST_CASE(InsertNewLayer) +BOOST_AUTO_TEST_CASE(InsertNewLayerBefore) { armnn::Graph graph; armnn::TensorInfo tensorInfo({ 1, 1, 1, 1 }, armnn::DataType::Float32); @@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) layerC->GetOutputSlot(0).Connect(layerD->GetInputSlot(1)); layerD->GetOutputSlot(0).Connect(layerO->GetInputSlot(0)); - // check order is valid + // Checks order is valid. BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerC)); BOOST_TEST(CheckOrder(graph, layerB, layerD)); @@ -147,7 +147,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); - // check order is valid + // Checks order is valid. BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerC)); BOOST_TEST(CheckOrder(graph, layerB, layerD)); @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) armnn::Layer* const layerF = GetFirstLayerWithName(graph, "layerF"); - // check order is valid + // Checks order is valid. 
BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerF)); BOOST_TEST(CheckOrder(graph, layerF, layerC)); @@ -178,6 +178,93 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) BOOST_TEST(CheckOrder(graph, layerE, layerD)); } +BOOST_AUTO_TEST_CASE(InsertNewLayerAfter) +{ + armnn::Graph graph; + armnn::TensorInfo tensorInfo({ 1, 1, 1, 1 }, armnn::DataType::Float32); + + std::vector<armnn::Layer*> order; + + armnn::ActivationDescriptor activationDefaults; + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::InputLayer>(0, "layerA")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::ActivationLayer>(activationDefaults, "layerB")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::ActivationLayer>(activationDefaults, "layerC")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::AdditionLayer>("layerD")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::OutputLayer>(0, "output")); + + armnn::Layer* const layerA = GetFirstLayerWithName(graph, "layerA"); + armnn::Layer* const layerB = GetFirstLayerWithName(graph, "layerB"); + armnn::Layer* const layerC = GetFirstLayerWithName(graph, "layerC"); + armnn::Layer* const layerD = GetFirstLayerWithName(graph, "layerD"); + armnn::Layer* const layerO = GetFirstLayerWithName(graph, "output"); + + // A + // / \' + // B C + // \ / + // D + layerA->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerB->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerC->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerD->GetOutputSlot(0).SetTensorInfo(tensorInfo); + + layerA->GetOutputSlot(0).Connect(layerB->GetInputSlot(0)); + layerA->GetOutputSlot(0).Connect(layerC->GetInputSlot(0)); + layerB->GetOutputSlot(0).Connect(layerD->GetInputSlot(0)); + layerC->GetOutputSlot(0).Connect(layerD->GetInputSlot(1)); + layerD->GetOutputSlot(0).Connect(layerO->GetInputSlot(0)); + + // Checks order is valid. 
+ BOOST_TEST(CheckOrder(graph, layerA, layerB)); + BOOST_TEST(CheckOrder(graph, layerA, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerD)); + + // A + // / \' + // B C + // \ | + // \ E + // \| + // D + BOOST_CHECK_NO_THROW(graph.InsertNewLayer<armnn::ActivationLayer>(layerC->GetOutputSlot(), + activationDefaults, + "layerE")); + + armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); + + // Checks order is valid. + BOOST_TEST(CheckOrder(graph, layerA, layerB)); + BOOST_TEST(CheckOrder(graph, layerA, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerE)); + BOOST_TEST(CheckOrder(graph, layerE, layerD)); + + + // A + // | + // F + // / \' + // B C + // \ | + // \ E + // \ / + // D + BOOST_CHECK_NO_THROW(graph.InsertNewLayer<armnn::ActivationLayer>(layerA->GetOutputSlot(), + activationDefaults, + "layerF")); + + armnn::Layer* const layerF = GetFirstLayerWithName(graph, "layerF"); + + // Checks order is valid. + BOOST_TEST(CheckOrder(graph, layerA, layerF)); + BOOST_TEST(CheckOrder(graph, layerF, layerB)); + BOOST_TEST(CheckOrder(graph, layerF, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerE)); + BOOST_TEST(CheckOrder(graph, layerE, layerD)); +} + namespace { using Edge = std::pair<const armnn::Layer*, const armnn::Layer*>; @@ -210,7 +297,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn std::vector<Edge> origEdges = GetEdgeList(origGraph); std::vector<Edge> newEdges = GetEdgeList(graph); - // Adding copy layers should not produce any duplicate edges + // Adding copy layers should not produce any duplicate edges. 
{ std::vector<Edge> sortedNewEdges = newEdges; std::sort(sortedNewEdges.begin(), sortedNewEdges.end()); @@ -219,7 +306,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn BOOST_CHECK_MESSAGE(last == sortedNewEdges.end(), "New graph contains duplicate edges!"); } - // Each new edge must be tested + // Each new edge must be tested. while (!newEdges.empty()) { const Edge edge = std::move(newEdges.back()); @@ -251,7 +338,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn BOOST_TEST((srcLayer->GetComputeDevice() == dstLayer->GetComputeDevice())); } - // Mark edge in original graph as observed (by deleting it) + // Marks edge in original graph as observed (by deleting it). origEdges.erase(origEdges.begin() + originalEdge); } else @@ -288,7 +375,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn const armnn::Layer* copyLayer = srcLayerInOrigGraph ? edge.second : edge.first; const armnn::Layer* nonCopyLayer = srcLayerInOrigGraph ? srcLayer : dstLayer; - // Find all edges connecting the copy layer to other layers + // Finds all edges connecting the copy layer to other layers. std::vector<Edge> adjEdges; auto it = newEdges.begin(); while (it != newEdges.end()) @@ -298,7 +385,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn { adjEdges.push_back(newEdge); - // Since the adjacent edge is immediately tested below, no need to consider it afterwards + // Since the adjacent edge is immediately tested below, there is no need to consider it afterwards. it = newEdges.erase(it); } else @@ -315,10 +402,10 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn continue; } - // Test adjacent edges now + // Tests adjacent edges now. for (const Edge& adjEdge : adjEdges) { - // The adjacent edge must connect the copy layer to another layer + // The adjacent edge must connect the copy layer to another layer. 
const armnn::Layer* adjLayer = srcLayerInOrigGraph ? adjEdge.second : adjEdge.first; if (!adjLayer) @@ -329,10 +416,10 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn continue; } - // Both layers must have different compute devices + // Both layers must have different compute devices. BOOST_TEST((nonCopyLayer->GetComputeDevice() != adjLayer->GetComputeDevice())); - // There must exist an edge connecting both layers directly in the original graph + // There must exist an edge connecting both layers directly in the original graph. { const armnn::Layer* origEdgeN1 = srcLayerInOrigGraph ? nonCopyLayer : adjLayer; const armnn::Layer* origEdgeN2 = srcLayerInOrigGraph ? adjLayer : nonCopyLayer; @@ -434,7 +521,7 @@ BOOST_FIXTURE_TEST_CASE(AddCopyLayersSeveralTimes, CopyLayersFixture) { m_Graph.AddCopyLayers(); - // Calling AddCopyLayers() several times should not change the connections + // Calling AddCopyLayers() several times should not change the connections. const std::vector<Edge> edges = GetEdgeList(m_Graph); for (int i = 0; i < 4; ++i) { diff --git a/src/armnn/test/InstrumentTests.cpp b/src/armnn/test/InstrumentTests.cpp new file mode 100644 index 0000000000..a219b39b0d --- /dev/null +++ b/src/armnn/test/InstrumentTests.cpp @@ -0,0 +1,62 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#include <boost/test/unit_test.hpp> + +#include "WallClockTimer.hpp" + +#include <chrono> +#include <thread> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Instruments) + +BOOST_AUTO_TEST_CASE(WallClockTimerInMilliseconds) +{ + WallClockTimer wallClockTimer; + + BOOST_CHECK_EQUAL(wallClockTimer.GetName(), "WallClockTimer"); + + // start the timer + wallClockTimer.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer + wallClockTimer.Stop(); + + BOOST_CHECK_EQUAL(wallClockTimer.GetMeasurements().front().m_Name, WallClockTimer::WALL_CLOCK_TIME); + + // check that WallClockTimer measurement should be >= 10 milliseconds + BOOST_CHECK_GE(wallClockTimer.GetMeasurements().front().m_Value, std::chrono::milliseconds(10).count()); +} + +BOOST_AUTO_TEST_CASE(WallClockTimerInNanoseconds) +{ + WallClockTimer wallClockTimer; + + BOOST_CHECK_EQUAL(wallClockTimer.GetName(), "WallClockTimer"); + + // start the timer + wallClockTimer.Start(); + + // wait for 500 nanoseconds - 0.0005 milliseconds + std::this_thread::sleep_for(std::chrono::nanoseconds(500)); + + // stop the timer + wallClockTimer.Stop(); + + BOOST_CHECK_EQUAL(wallClockTimer.GetMeasurements().front().m_Name, WallClockTimer::WALL_CLOCK_TIME); + + // delta is 0.0005 milliseconds + const auto delta = + std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(std::chrono::nanoseconds(500)); + + // check that WallClockTimer measurement should be >= 0.0005 milliseconds + BOOST_CHECK_GE(wallClockTimer.GetMeasurements().front().m_Value, delta.count()); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/JsonPrinterTests.cpp b/src/armnn/test/JsonPrinterTests.cpp new file mode 100644 index 0000000000..28cbfd61a5 --- /dev/null +++ b/src/armnn/test/JsonPrinterTests.cpp @@ -0,0 +1,378 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#include <boost/test/unit_test.hpp> +#include <boost/algorithm/string.hpp> +#include <boost/lexical_cast.hpp> +#include <stack> +#include <string> +#include <vector> +#include <sstream> + +#include "Profiling.hpp" +#include "armnn/Descriptors.hpp" +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "backends/test/ClContextControlFixture.hpp" +#include "backends/ClWorkloadFactory.hpp" + +BOOST_FIXTURE_TEST_SUITE(JsonPrinterTests, ClProfilingContextControlFixture) + +bool AreMatchingPair(const char opening, const char closing) +{ + return (opening == '{' && closing == '}') || (opening == '[' && closing == ']'); +} + +bool AreParenthesesMatching(const std::string& exp) +{ + std::stack<char> expStack; + for (size_t i = 0; i < exp.length(); ++i) + { + if (exp[i] == '{' || exp[i] == '[') + { + expStack.push(exp[i]); + } + else if (exp[i] == '}' || exp[i] == ']') + { + if (expStack.empty() || !AreMatchingPair(expStack.top(), exp[i])) + { + return false; + } + else + { + expStack.pop(); + } + } + } + return expStack.empty(); +} + +std::vector<double> ExtractMeasurements(const std::string& exp) +{ + std::vector<double> numbers; + bool inArray = false; + std::string numberString; + for (size_t i = 0; i < exp.size(); ++i) + { + if (exp[i] == '[') + { + inArray = true; + } + else if (exp[i] == ']' && inArray) + { + try + { + boost::trim_if(numberString, boost::is_any_of("\t,\n")); + numbers.push_back(std::stod(numberString)); + } + catch (std::invalid_argument const& e) + { + BOOST_FAIL("Could not convert measurements to double: " + numberString); + } + + numberString.clear(); + inArray = false; + } + else if (exp[i] == ',' && inArray) + { + try + { + boost::trim_if(numberString, boost::is_any_of("\t,\n")); + numbers.push_back(std::stod(numberString)); + } + catch (std::invalid_argument const& e) + { + BOOST_FAIL("Could not convert measurements to double: " + numberString); + } + numberString.clear(); + } + else if (exp[i] != '[' && inArray && exp[i] 
!= ',' && exp[i] != ' ') + { + numberString += exp[i]; + } + } + return numbers; +} + +std::vector<std::string> ExtractSections(const std::string& exp) +{ + std::vector<std::string> sections; + + std::stack<size_t> s; + for (size_t i = 0; i < exp.size(); i++) + { + if (exp.at(i) == '{') + { + s.push(i); + } + else if (exp.at(i) == '}') + { + size_t from = s.top(); + s.pop(); + sections.push_back(exp.substr(from, i - from + 1)); + } + } + + return sections; +} + +std::string SoftmaxProfilerTestSetupHelper(const std::vector<armnn::Compute>& backends) +{ + using namespace armnn; + + BOOST_CHECK(!backends.empty()); + + ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Create runtime in which test will run + IRuntime::CreationOptions options; + options.m_EnableGpuProfiling = backends.front() == armnn::Compute::GpuAcc; + IRuntimePtr runtime(IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0, "input"); + IConnectableLayer* softmax = net->AddSoftmaxLayer(SoftmaxDescriptor(), "softmax"); + IConnectableLayer* output = net->AddOutputLayer(0, "output"); + + input->GetOutputSlot(0).Connect(softmax->GetInputSlot(0)); + softmax->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + // set the tensors in the network + TensorInfo inputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); + inputTensorInfo.SetQuantizationOffset(100); + inputTensorInfo.SetQuantizationScale(10000.0f); + input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); + outputTensorInfo.SetQuantizationOffset(0); + outputTensorInfo.SetQuantizationScale(1.0f / 256.0f); + softmax->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // optimize the network + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + if(!optNet) + { + BOOST_FAIL("Error occurred 
during Optimization, Optimize() returned nullptr."); + } + // load it into the runtime + NetworkId netId; + auto error = runtime->LoadNetwork(netId, std::move(optNet)); + BOOST_TEST(error == Status::Success); + + // create structures for input & output + std::vector<uint8_t> inputData + { + 1, 10, 3, 200, 5 + // one of inputs is sufficiently larger than the others to saturate softmax + }; + std::vector<uint8_t> outputData(5); + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())} + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + runtime->GetProfiler(netId)->EnableProfiling(true); + + // do the inferences + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // retrieve the Profiler.Print() output + std::stringstream ss; + profilerManager.GetProfiler()->Print(ss); + + return ss.str(); +} + +void SoftmaxProfilerTestValidationHelper(std::string& result, const std::string& testData) +{ + // ensure all measurements are greater than zero + std::vector<double> measurementsVector = ExtractMeasurements(result); + BOOST_CHECK(!measurementsVector.empty()); + + // check sections contain raw and unit tags + // first ensure Parenthesis are balanced + if (AreParenthesesMatching(result)) + { + // remove parent sections that will not have raw or unit tag + std::vector<std::string> sectionVector = ExtractSections(result); + for (size_t i = 0; i < sectionVector.size(); ++i) + { + if (boost::contains(sectionVector[i], "\"ArmNN\":") + || boost::contains(sectionVector[i], "\"inference_measurements\":")) + { + sectionVector.erase(sectionVector.begin() + static_cast<int>(i)); + } + } + BOOST_CHECK(!sectionVector.empty()); + + BOOST_CHECK(std::all_of(sectionVector.begin(), sectionVector.end(), + 
[](std::string i) { return boost::contains(i, "\"raw\":"); })); + + BOOST_CHECK(std::all_of(sectionVector.begin(), sectionVector.end(), + [](std::string i) { return boost::contains(i, "\"unit\":"); })); + } + + // remove the time measurements as they vary from test to test + result.erase(std::remove_if (result.begin(),result.end(), + [](char c) { return c == '.'; }), result.end()); + result.erase(std::remove_if (result.begin(), result.end(), &isdigit), result.end()); + result.erase(std::remove_if (result.begin(),result.end(), + [](char c) { return c == '\t'; }), result.end()); + + BOOST_CHECK(boost::contains(result, "ArmNN")); + BOOST_CHECK(boost::contains(result, "inference_measurements")); + BOOST_CHECK(boost::contains(result, "layer_measurements")); + BOOST_CHECK_EQUAL(result, testData); + + // ensure no spare parenthesis present in print output + BOOST_CHECK(AreParenthesesMatching(result)); +} + +void SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult( + const std::vector<armnn::Compute>& backends) +{ + // setup the test fixture and obtain JSON Printer result + std::string result = SoftmaxProfilerTestSetupHelper(backends); + + std::string backend = "Ref"; + std::string changeLine31 = "\n},\n\"CopyMemGeneric_Execute\": {"; + std::string changeLine39 = "ms\""; + std::string changeLine40; + std::string changeLine45; + + switch(backends[0]) { + case armnn::Compute::GpuAcc: backend = "Cl"; + changeLine31 = ",\n\"OpenClKernelTimer/: softmax_layer_max_shift_exp_sum_quantized_serial GWS[,,]\": {"; + changeLine39 = R"(us" +}, +"OpenClKernelTimer/: softmax_layer_norm_quantized GWS[,,]": { +"raw": [ +, +, + +], +"unit": "us")"; + + changeLine40 = R"( +}, +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine45 = "}\n"; + break; + case armnn::Compute::CpuAcc: backend = "Neon"; + changeLine31 = ",\n\"NeonKernelTimer/: NEFillBorderKernel\": {"; + changeLine39 = R"(ms" +}, +"NeonKernelTimer/: NELogitsDMaxKernel": { +"raw": [ +, +, 
+ +], +"unit": "ms" +}, +"NeonKernelTimer/: NELogitsDSoftmaxKernel": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine40 = R"( +}, +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine45 = "}\n"; + break; + default: + break; + } + std::string testData = R"({ +"ArmNN": { +"inference_measurements": { +"raw": [ +, +, + +], +"unit": "ms", +"layer_measurements": { +"raw": [ +, +, + +], +"unit": "ms", +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms" +}, +")" + backend + R"(SoftmaxUintWorkload_Execute": { +"raw": [ +, +, + +], +"unit": "ms")" + changeLine31 + R"( +"raw": [ +, +, + +], +"unit": ")" + changeLine39 + R"( +})" + changeLine40 + R"( +} +} +} +} +)" + changeLine45 + R"()"; + + // validate the JSON Printer result + SoftmaxProfilerTestValidationHelper(result, testData); +} + +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterCpuRefTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::CpuRef}); +} + + +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterCpuAccTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::CpuAcc}); +} +#endif + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterGpuAccTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::GpuAcc}); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/NeonTimerTest.cpp b/src/armnn/test/NeonTimerTest.cpp new file mode 100644 index 0000000000..4502756e07 --- /dev/null +++ b/src/armnn/test/NeonTimerTest.cpp @@ -0,0 +1,104 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "NeonTimer.hpp" +#include "TensorHelpers.hpp" + +#include "armnn/ArmNN.hpp" +#include "armnn/Tensor.hpp" +#include "armnn/TypesUtils.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/NeonWorkloadFactory.hpp" +#include "backends/WorkloadInfo.hpp" +#include "backends/WorkloadFactory.hpp" +#include "backends/test/LayerTests.hpp" +#include "backends/test/TensorCopyUtils.hpp" +#include "backends/test/WorkloadTestUtils.hpp" + +#include <boost/test/unit_test.hpp> +#include <cstdlib> +#include <algorithm> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(NeonTimerInstrument) + + +BOOST_AUTO_TEST_CASE(NeonTimerGetName) +{ + NeonTimer neonTimer; + BOOST_CHECK_EQUAL(neonTimer.GetName(), "NeonKernelTimer"); +} + +BOOST_AUTO_TEST_CASE(NeonTimerMeasure) +{ + NeonWorkloadFactory workloadFactory; + + unsigned int inputWidth = 4000u; + unsigned int inputHeight = 5000u; + unsigned int inputChannels = 1u; + unsigned int inputBatchSize = 1u; + + float upperBound = 1.0f; + float lowerBound = -1.0f; + + size_t inputSize = inputWidth * inputHeight * inputChannels * inputBatchSize; + std::vector<float> inputData(inputSize, 0.f); + std::generate(inputData.begin(), inputData.end(), [](){ + return (static_cast<float>(rand()) / static_cast<float>(RAND_MAX / 3)) + 1.f; }); + + unsigned int outputWidth = inputWidth; + unsigned int outputHeight = inputHeight; + unsigned int outputChannels = inputChannels; + unsigned int outputBatchSize = inputBatchSize; + + armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, + armnn::GetDataType<float>()); + + armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, + armnn::GetDataType<float>()); + + LayerTestResult<float, 4> result(inputTensorInfo); + + auto input = MakeTensor<float, 4>(inputTensorInfo, inputData); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + 
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + // Setup bounded ReLu + armnn::ActivationQueueDescriptor descriptor; + armnn::WorkloadInfo workloadInfo; + AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get()); + + descriptor.m_Parameters.m_Function = armnn::ActivationFunction::BoundedReLu; + descriptor.m_Parameters.m_A = upperBound; + descriptor.m_Parameters.m_B = lowerBound; + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + NeonTimer neonTimer; + // Start the timer. + neonTimer.Start(); + // Execute the workload. + workload->Execute(); + // Stop the timer. + neonTimer.Stop(); + + std::vector<Measurement> measurements = neonTimer.GetMeasurements(); + + BOOST_CHECK_EQUAL(measurements.size(), 2); + BOOST_CHECK_EQUAL(measurements[0].m_Name, "NeonKernelTimer/0: NEFillBorderKernel"); + BOOST_CHECK(measurements[0].m_Value > 0.0); + BOOST_CHECK_EQUAL(measurements[1].m_Name, "NeonKernelTimer/1: NEActivationLayerKernel"); + BOOST_CHECK(measurements[1].m_Value > 0.0); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/NetworkTests.cpp b/src/armnn/test/NetworkTests.cpp new file mode 100644 index 0000000000..66fa327221 --- /dev/null +++ b/src/armnn/test/NetworkTests.cpp @@ -0,0 +1,968 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#include <boost/test/unit_test.hpp> + +#include "armnn/ArmNN.hpp" +#include "Network.hpp" +#include "Graph.hpp" +#include "backends/RefWorkloadFactory.hpp" +#include "backends/ClWorkloadFactory.hpp" +#include "backends/NeonWorkloadFactory.hpp" + +#include "GraphUtils.hpp" + +namespace +{ + +bool AreAllLayerInputSlotsConnected(const armnn::IConnectableLayer& layer) +{ + bool allConnected = true; + for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) + { + const bool inputConnected = layer.GetInputSlot(i).GetConnection() != nullptr; + allConnected &= inputConnected; + } + return allConnected; +} + +} + +BOOST_AUTO_TEST_SUITE(Network) + +BOOST_AUTO_TEST_CASE(LayerGuids) +{ + armnn::Network net; + armnn::LayerGuid inputId = net.AddInputLayer(0)->GetGuid(); + armnn::LayerGuid addId = net.AddAdditionLayer()->GetGuid(); + armnn::LayerGuid outputId = net.AddOutputLayer(0)->GetGuid(); + + BOOST_TEST(inputId != addId); + BOOST_TEST(addId != outputId); + BOOST_TEST(inputId != outputId); +} + +BOOST_AUTO_TEST_CASE(SerializeToDot) +{ + armnn::Network net; + + //Defines layers. + auto input = net.AddInputLayer(0); + auto add = net.AddAdditionLayer(); + auto output = net.AddOutputLayer(0); + + // Connects layers. 
+ input->GetOutputSlot(0).Connect(add->GetInputSlot(0)); + input->GetOutputSlot(0).Connect(add->GetInputSlot(1)); + add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + add->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + + std::ostringstream ss; + optimizedNet->SerializeToDot(ss); + + auto inputId = input->GetGuid(); + auto addId = add->GetGuid(); + auto outputId = output->GetGuid(); + + std::stringstream expected; + expected << + "digraph Optimized {\n" + " node [shape=\"record\"];\n" + " edge [fontsize=8 fontcolor=\"blue\" fontname=\"arial-bold\"];\n" + " " << inputId << " [label=\"{Input}\"];\n" + " " << addId << " [label=\"{Addition}\"];\n" + " " << outputId << " [label=\"{Output}\"];\n" + " " << inputId << " -> " << addId << " [label=< [4] >];\n" + " " << inputId << " -> " << addId << " [label=< [4] >];\n" + " " << addId << " -> " << outputId << " [label=< [4] >];\n" + "}\n"; + + BOOST_TEST(ss.str() == expected.str()); +} + +BOOST_AUTO_TEST_CASE(NetworkBasic) +{ + armnn::Network net; + BOOST_TEST(net.PrintGraph() == armnn::Status::Success); +} + +BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForINetwork) +{ + armnn::Network net; + armnn::INetwork& inet = net; + inet.AddInputLayer(0); + inet.AddAdditionLayer(); + inet.AddActivationLayer(armnn::ActivationDescriptor()); + inet.AddOutputLayer(0); +} + +BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForNetwork) +{ + armnn::Network net; + net.AddInputLayer(0); + net.AddAdditionLayer(); + net.AddActivationLayer(armnn::ActivationDescriptor()); + net.AddOutputLayer(0); +} + 
+BOOST_AUTO_TEST_CASE(NetworkModification) +{ + armnn::Network net; + + armnn::IConnectableLayer* const inputLayer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(inputLayer); + + unsigned int dims[] = { 10,1,1,1 }; + std::vector<float> convWeightsData(10); + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), convWeightsData); + + armnn::Convolution2dDescriptor convDesc2d; + armnn::IConnectableLayer* const convLayer = net.AddConvolution2dLayer(convDesc2d, weights, "conv layer"); + BOOST_TEST(convLayer); + + inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + armnn::IConnectableLayer* const fullyConnectedLayer = net.AddFullyConnectedLayer(fullyConnectedDesc, + weights, + "fully connected"); + BOOST_TEST(fullyConnectedLayer); + + convLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0)); + + armnn::Pooling2dDescriptor pooling2dDesc; + armnn::IConnectableLayer* const poolingLayer = net.AddPooling2dLayer(pooling2dDesc, "pooling2d"); + BOOST_TEST(poolingLayer); + + fullyConnectedLayer->GetOutputSlot(0).Connect(poolingLayer->GetInputSlot(0)); + + armnn::ActivationDescriptor activationDesc; + armnn::IConnectableLayer* const activationLayer = net.AddActivationLayer(activationDesc, "activation"); + BOOST_TEST(activationLayer); + + poolingLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); + + armnn::NormalizationDescriptor normalizationDesc; + armnn::IConnectableLayer* const normalizationLayer = net.AddNormalizationLayer(normalizationDesc, "normalization"); + BOOST_TEST(normalizationLayer); + + activationLayer->GetOutputSlot(0).Connect(normalizationLayer->GetInputSlot(0)); + + armnn::SoftmaxDescriptor softmaxDesc; + armnn::IConnectableLayer* const softmaxLayer = net.AddSoftmaxLayer(softmaxDesc, "softmax"); + BOOST_TEST(softmaxLayer); + + normalizationLayer->GetOutputSlot(0).Connect(softmaxLayer->GetInputSlot(0)); + + 
armnn::BatchNormalizationDescriptor batchNormDesc; + + armnn::TensorInfo tensorInfo({ 1 }, armnn::DataType::Float32); + std::vector<float> data(tensorInfo.GetNumBytes() / sizeof(float)); + armnn::ConstTensor invalidTensor(tensorInfo, data); + + armnn::IConnectableLayer* const batchNormalizationLayer = net.AddBatchNormalizationLayer(batchNormDesc, + invalidTensor, + invalidTensor, + invalidTensor, + invalidTensor, + "batch norm"); + BOOST_TEST(batchNormalizationLayer); + + softmaxLayer->GetOutputSlot(0).Connect(batchNormalizationLayer->GetInputSlot(0)); + + armnn::IConnectableLayer* const additionLayer = net.AddAdditionLayer("addition"); + BOOST_TEST(additionLayer); + + batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + + armnn::IConnectableLayer* const multiplicationLayer = net.AddMultiplicationLayer("multiplication"); + BOOST_TEST(multiplicationLayer); + + additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(0)); + additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(1)); + + armnn::IConnectableLayer* const outputLayer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(outputLayer); + + multiplicationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + //Tests that all layers are present in the graph. + BOOST_TEST(net.GetGraph().GetNumLayers() == 11); + + //Tests that the vertices exist and have correct names. 
+ BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "input layer")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "conv layer")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "fully connected")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "pooling2d")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "activation")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "normalization")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "softmax")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "batch norm")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "addition")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "multiplication")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "output layer")); + + auto checkOneOutputToOneInputConnection = [] + (const armnn::IConnectableLayer* const srcLayer, + const armnn::IConnectableLayer* const tgtLayer, + int expectedSrcNumInputs = 1, + int expectedDstNumOutputs = 1) + { + BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); + BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumInputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); + + BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 1); + BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(0) == &tgtLayer->GetInputSlot(0)); + BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(0).GetConnection()); + }; + auto checkOneOutputToTwoInputsConnections = [] + (const armnn::IConnectableLayer* const srcLayer, + const armnn::IConnectableLayer* const tgtLayer, + int expectedSrcNumInputs, + int expectedDstNumOutputs = 1) + { + BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); + BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumInputSlots() == 2); + BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); + + BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 2); + for (unsigned int i = 0; i < 
srcLayer->GetOutputSlot(0).GetNumConnections(); ++i) + { + BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(i) == &tgtLayer->GetInputSlot(i)); + BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(i).GetConnection()); + } + }; + + BOOST_TEST(AreAllLayerInputSlotsConnected(*convLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*fullyConnectedLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*poolingLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*activationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*normalizationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*softmaxLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*batchNormalizationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*additionLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*multiplicationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*outputLayer)); + + // Checks connectivity. + checkOneOutputToOneInputConnection(inputLayer, convLayer, 0); + checkOneOutputToOneInputConnection(convLayer, fullyConnectedLayer); + checkOneOutputToOneInputConnection(fullyConnectedLayer, poolingLayer); + checkOneOutputToOneInputConnection(poolingLayer, activationLayer); + checkOneOutputToOneInputConnection(activationLayer, normalizationLayer); + checkOneOutputToOneInputConnection(normalizationLayer, softmaxLayer); + checkOneOutputToOneInputConnection(softmaxLayer, batchNormalizationLayer); + checkOneOutputToTwoInputsConnections(batchNormalizationLayer, additionLayer, 1); + checkOneOutputToTwoInputsConnections(additionLayer, multiplicationLayer, 2); + checkOneOutputToOneInputConnection(multiplicationLayer, outputLayer, 2, 0); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMerger) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* inputLayer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(inputLayer); + + // Adds a splitter layer. 
+ armnn::ViewsDescriptor splitterDesc(2,4); + + armnn::IConnectableLayer* splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + inputLayer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* softmaxLayer1 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmaxLayer1); + + splitterLayer->GetOutputSlot(0).Connect(softmaxLayer1->GetInputSlot(0)); + + // Adds a softmax layer 2. + armnn::IConnectableLayer* softmaxLayer2 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmaxLayer2); + + splitterLayer->GetOutputSlot(1).Connect(softmaxLayer2->GetInputSlot(0)); + + // Adds a merger layer. + armnn::OriginsDescriptor mergerDesc(2, 4); + + armnn::IConnectableLayer* mergerLayer = net.AddMergerLayer(mergerDesc, "merger layer"); + BOOST_TEST(mergerLayer); + + softmaxLayer1->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(0)); + softmaxLayer2->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(1)); + + // Adds an output layer. 
+ armnn::IConnectableLayer* outputLayer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(outputLayer); + + mergerLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + BOOST_TEST(splitterLayer->GetNumOutputSlots() == 2); + BOOST_TEST(splitterLayer->GetOutputSlot(0).GetConnection(0) == &softmaxLayer1->GetInputSlot(0)); + BOOST_TEST(&splitterLayer->GetOutputSlot(0) == softmaxLayer1->GetInputSlot(0).GetConnection()); + BOOST_TEST(splitterLayer->GetOutputSlot(1).GetConnection(0) == &softmaxLayer2->GetInputSlot(0)); + BOOST_TEST(&splitterLayer->GetOutputSlot(1) == softmaxLayer2->GetInputSlot(0).GetConnection()); + + BOOST_TEST(mergerLayer->GetNumInputSlots() == 2); + BOOST_TEST(softmaxLayer1->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(0)); + BOOST_TEST(&softmaxLayer1->GetOutputSlot(0) == mergerLayer->GetInputSlot(0).GetConnection()); + BOOST_TEST(softmaxLayer2->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(1)); + BOOST_TEST(&softmaxLayer2->GetOutputSlot(0) == mergerLayer->GetInputSlot(1).GetConnection()); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterAddition) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(layer); + + // Adds a splitter layer. + armnn::ViewsDescriptor splitterDesc(2,4); + + armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmax1Layer); + + splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); + + // Adds a softmax layer 2. 
+ armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmax2Layer); + + splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); + + // Adds addition layer. + layer = net.AddAdditionLayer("add layer"); + BOOST_TEST(layer); + + softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + + // Adds an output layer. + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddOutputLayer(0, "output layer"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + BOOST_TEST(layer); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMultiplication) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(layer); + + // Adds a splitter layer. + armnn::ViewsDescriptor splitterDesc(2,4); + armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmax1Layer); + + splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); + + // Adds a softmax layer 2. + armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmax2Layer); + + splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); + + // Adds multiplication layer. + layer = net.AddMultiplicationLayer("multiplication layer"); + BOOST_TEST(layer); + + softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + + // Adds an output layer. 
+ armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(layer); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateCpuRefWorkloads) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph().AllocateDynamicBuffers(); + 
BOOST_CHECK(optNet); + + // Validates workloads. + armnn::RefWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} + +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(OptimizeValidateCpuAccDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + // validate workloads + armnn::NeonWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +#endif // ARMCOMPUTENEON_ENABLED + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(OptimizeValidateGpuDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + 
armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::GpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + // validate workloads + armnn::ClWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::GpuAcc, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +#endif // ARMCOMPUTECL_ENABLED + +BOOST_AUTO_TEST_CASE(OptimizeValidateDeviceNonSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so Optimize will return null. 
+ armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateDeviceNonSupportLayerWithFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but it allows to fallback to CpuRef. 
+ armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_REQUIRE(optNet); + + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + // If NEON is enabled, Input and Output layers are supported by CpuAcc, + // the other layers are supported by CpuRef. + // If NEON is not enabled, all layers are supported by CpuRef. 
+#if ARMCOMPUTENEON_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#else + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); +#endif + } +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsUndefinedComputeDevice) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr 
runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::Undefined }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); + +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsUndefinedComputeDeviceWithFallback) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::Undefined, armnn::Compute::CpuRef }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, 
runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // validate workloads + armnn::RefWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsDuplicateComputeDeviceWithFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but it allows to fallback to CpuRef. + armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, + armnn::Compute::GpuAcc, + armnn::Compute::CpuRef }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_REQUIRE(optNet); + + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + // If NEON is enabled, Input and Output layers are supported by CpuAcc, + // the other layers are supported by CpuRef. + // If only CL is enabled, Input and Output layers are supported by GpuAcc, + // the other layers are supported by CpuRef. 
+ // If neither NEON, nor CL is enabled, all layers are supported by CpuRef. +#if ARMCOMPUTENEON_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#elif ARMCOMPUTECL_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::GpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#else + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); +#endif + } +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsCpuRefPermuteLayer) +{ + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::PermuteDescriptor descriptor({0, 2, 3, 1}); + armnn::IConnectableLayer* permute = net->AddPermuteLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(permute->GetInputSlot(0)); + permute->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + permute->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 4, 1, 4 }, armnn::DataType::Float32)); + + // optimize the network + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + + for (auto&& layer : 
static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +} + +BOOST_AUTO_TEST_CASE(FP16TurboModeTestOnCpuRef) +{ + // Test to check when FP16 Turbo mode set + // it converts the FP32 network to FP16 Network + // add FP32ToFP16 conversion layer after the InputLayer + // add FP16ToFP32 conversion layer after the OutputLayer + // checks the other layers if they are supported in FP16 + // if they are not put the conversion layers before and after + // if they are not supported in FP16 use FP32 instead + // if there are inverse conversion layers remove them with optimization + // at the moment FloorLayer is not supported in FP16 so it rolls back to FP32 + // and inverse conversion layers are removed by the optimizer + armnn::Network net; + + // Defines layers. + auto input = net.AddInputLayer(0); + auto floor = net.AddFloorLayer(); + auto output = net.AddOutputLayer(0); + + // Connects layers. + input->GetOutputSlot(0).Connect(floor->GetInputSlot(0)); + floor->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + floor->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + + armnn::OptimizerOptions optimizerOptions; + optimizerOptions.m_ReduceFp32ToFp16 = true; + + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec(), + optimizerOptions); + + std::ostringstream ss; + optimizedNet->SerializeToDot(ss); + + auto inputId = input->GetGuid(); + auto floorId = floor->GetGuid(); + auto outputId = output->GetGuid(); + + std::stringstream expected; + expected << + "digraph Optimized {\n" + " node [shape=\"record\"];\n" + " edge [fontsize=8 
fontcolor=\"blue\" fontname=\"arial-bold\"];\n" + " " << inputId << " [label=\"{Input}\"];\n" + " " << floorId << " [label=\"{Floor}\"];\n" + " " << outputId << " [label=\"{Output}\"];\n" + " " << inputId << " -> " << floorId << " [label=< [4] >];\n" + " " << floorId << " -> " << outputId << " [label=< [4] >];\n" + "}\n"; + + BOOST_TEST(ss.str() == expected.str()); +} + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(FP16TurboModeTestOnGpuAcc) +{ + // Test to check when Fp16 Turbo mode set + // it converts the Fp32 network to Fp16 Network + // add Fp32ToFp16 conversion layer after the InputLayer + // add Fp16ToFp32 conversion layer after the OutputLayer + // checks the other layers if they are supported in Fp16 + // if they are not put the conversion layers before and after + // if they are not supported in Fp16 use Fp32 instead + // if there are inverse conversion layers remove them with optimization + // at the moment FloorLayer is not supported in Fp16 so it rolls back to Fp32 + // and inverse conversion layers are removed by the optimizer + armnn::Network net; + + // Defines layers. + auto input = net.AddInputLayer(0, "input layer"); + // ReLu1 + armnn::ActivationDescriptor activation1Descriptor; + activation1Descriptor.m_Function = armnn::ActivationFunction::BoundedReLu; + activation1Descriptor.m_A = 1.f; + activation1Descriptor.m_B = -1.f; + auto activation = net.AddActivationLayer(activation1Descriptor, "activation layer"); + auto output = net.AddOutputLayer(0, "output layer"); + + // Connects layers. 
+ input->GetOutputSlot(0).Connect(activation->GetInputSlot(0)); + activation->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + activation->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; + + armnn::OptimizerOptions optimizerOptions; + optimizerOptions.m_ReduceFp32ToFp16 = true; + + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec(), + optimizerOptions); + + const armnn::Graph& graph = static_cast<armnn::OptimizedNetwork*>(optimizedNet.get())->GetGraph(); + + // Tests that all layers are present in the graph. + BOOST_TEST(graph.GetNumLayers() == 5); + + // Tests that the vertices exist and have correct names. + BOOST_TEST(GraphHasNamedLayer(graph, "input layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "convert_fp32_to_fp16-0-input layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "activation layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "convert_fp16_to_fp32-0-output layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "output layer")); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/Network_test.cpp b/src/armnn/test/Network_test.cpp deleted file mode 100644 index 057caa0505..0000000000 --- a/src/armnn/test/Network_test.cpp +++ /dev/null @@ -1,483 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. 
-// -#include <boost/test/unit_test.hpp> - -#include "armnn/ArmNN.hpp" -#include "Network.hpp" -#include "Graph.hpp" -#include "backends/RefWorkloadFactory.hpp" - -#include "GraphUtils.hpp" - -namespace -{ - -bool AreAllLayerInputSlotsConnected(const armnn::IConnectableLayer& layer) -{ - bool allConnected = true; - for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) - { - const bool inputConnected = layer.GetInputSlot(i).GetConnection() != nullptr; - allConnected &= inputConnected; - } - return allConnected; -} - -} - -BOOST_AUTO_TEST_SUITE(Network) - -BOOST_AUTO_TEST_CASE(LayerGuids) -{ - armnn::Network net; - armnn::LayerGuid inputId = net.AddInputLayer(0)->GetGuid(); - armnn::LayerGuid addId = net.AddAdditionLayer()->GetGuid(); - armnn::LayerGuid outputId = net.AddOutputLayer(0)->GetGuid(); - - BOOST_TEST(inputId != addId); - BOOST_TEST(addId != outputId); - BOOST_TEST(inputId != outputId); -} - -BOOST_AUTO_TEST_CASE(SerializeToDot) -{ - armnn::Network net; - - //define layers - auto input = net.AddInputLayer(0); - auto add = net.AddAdditionLayer(); - auto output = net.AddOutputLayer(0); - - // connect layers - input->GetOutputSlot(0).Connect(add->GetInputSlot(0)); - input->GetOutputSlot(0).Connect(add->GetInputSlot(1)); - add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - - armnn::TensorShape shape({4}); - armnn::TensorInfo info(shape, armnn::DataType::Float32); - input->GetOutputSlot(0).SetTensorInfo(info); - add->GetOutputSlot(0).SetTensorInfo(info); - - armnn::DeviceSpec spec; - spec.DefaultComputeDevice = armnn::Compute::CpuAcc; - armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, spec); - - std::ostringstream ss; - optimizedNet->SerializeToDot(ss); - - auto inputId = input->GetGuid(); - auto addId = add->GetGuid(); - auto outputId = output->GetGuid(); - - std::stringstream expected; - expected << - "digraph Optimized {\n" - " node [shape=\"record\"];\n" - " edge [fontsize=8 fontcolor=\"blue\" fontname=\"arial-bold\"];\n" - " " 
<< inputId << " [label=\"{Input}\"];\n" - " " << addId << " [label=\"{Addition}\"];\n" - " " << outputId << " [label=\"{Output}\"];\n" - " " << inputId << " -> " << addId << " [label=< [4] >];\n" - " " << inputId << " -> " << addId << " [label=< [4] >];\n" - " " << addId << " -> " << outputId << " [label=< [4] >];\n" - "}\n"; - - BOOST_TEST(ss.str() == expected.str()); -} - -BOOST_AUTO_TEST_CASE(NetworkBasic) -{ - armnn::Network net; - BOOST_TEST(net.PrintGraph() == armnn::Status::Success); -} - -BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForINetwork) -{ - armnn::Network net; - armnn::INetwork& inet = net; - inet.AddInputLayer(0); - inet.AddAdditionLayer(); - inet.AddActivationLayer(armnn::ActivationDescriptor()); - inet.AddOutputLayer(0); -} - -BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForNetwork) -{ - armnn::Network net; - net.AddInputLayer(0); - net.AddAdditionLayer(); - net.AddActivationLayer(armnn::ActivationDescriptor()); - net.AddOutputLayer(0); -} - -BOOST_AUTO_TEST_CASE(NetworkModification) -{ - armnn::Network net; - - armnn::IConnectableLayer* const inputLayer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(inputLayer); - - unsigned int dims[] = { 10,1,1,1 }; - std::vector<float> convWeightsData(10); - armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), convWeightsData); - - armnn::Convolution2dDescriptor convDesc2d; - armnn::IConnectableLayer* const convLayer = net.AddConvolution2dLayer(convDesc2d, weights, "conv layer"); - BOOST_TEST(convLayer); - - inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); - - armnn::FullyConnectedDescriptor fullyConnectedDesc; - armnn::IConnectableLayer* const fullyConnectedLayer = net.AddFullyConnectedLayer(fullyConnectedDesc, - weights, - "fully connected"); - BOOST_TEST(fullyConnectedLayer); - - convLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0)); - - armnn::Pooling2dDescriptor pooling2dDesc; - armnn::IConnectableLayer* const poolingLayer = 
net.AddPooling2dLayer(pooling2dDesc, "pooling2d"); - BOOST_TEST(poolingLayer); - - fullyConnectedLayer->GetOutputSlot(0).Connect(poolingLayer->GetInputSlot(0)); - - armnn::ActivationDescriptor activationDesc; - armnn::IConnectableLayer* const activationLayer = net.AddActivationLayer(activationDesc, "activation"); - BOOST_TEST(activationLayer); - - poolingLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); - - armnn::NormalizationDescriptor normalizationDesc; - armnn::IConnectableLayer* const normalizationLayer = net.AddNormalizationLayer(normalizationDesc, "normalization"); - BOOST_TEST(normalizationLayer); - - activationLayer->GetOutputSlot(0).Connect(normalizationLayer->GetInputSlot(0)); - - armnn::SoftmaxDescriptor softmaxDesc; - armnn::IConnectableLayer* const softmaxLayer = net.AddSoftmaxLayer(softmaxDesc, "softmax"); - BOOST_TEST(softmaxLayer); - - normalizationLayer->GetOutputSlot(0).Connect(softmaxLayer->GetInputSlot(0)); - - armnn::BatchNormalizationDescriptor batchNormDesc; - - armnn::TensorInfo tensorInfo({ 1 }, armnn::DataType::Float32); - std::vector<float> data(tensorInfo.GetNumBytes() / sizeof(float)); - armnn::ConstTensor invalidTensor(tensorInfo, data); - - armnn::IConnectableLayer* const batchNormalizationLayer = net.AddBatchNormalizationLayer(batchNormDesc, - invalidTensor, - invalidTensor, - invalidTensor, - invalidTensor, - "batch norm"); - BOOST_TEST(batchNormalizationLayer); - - softmaxLayer->GetOutputSlot(0).Connect(batchNormalizationLayer->GetInputSlot(0)); - - armnn::IConnectableLayer* const additionLayer = net.AddAdditionLayer("addition"); - BOOST_TEST(additionLayer); - - batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); - batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); - - armnn::IConnectableLayer* const multiplicationLayer = net.AddMultiplicationLayer("multiplication"); - BOOST_TEST(multiplicationLayer); - - 
additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(0)); - additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(1)); - - armnn::IConnectableLayer* const outputLayer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(outputLayer); - - multiplicationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - - //Test that all layers are present in the graph - BOOST_TEST(net.GetGraph().GetNumLayers() == 11); - - //Test that the vertices exist and have correct names - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "input layer")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "conv layer")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "fully connected")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "pooling2d")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "activation")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "normalization")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "softmax")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "batch norm")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "addition")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "multiplication")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "output layer")); - - auto checkOneOutputToOneInputConnection = [] - (const armnn::IConnectableLayer* const srcLayer, - const armnn::IConnectableLayer* const tgtLayer, - int expectedSrcNumInputs = 1, - int expectedDstNumOutputs = 1) - { - BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); - BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumInputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); - - BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 1); - BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(0) == &tgtLayer->GetInputSlot(0)); - BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(0).GetConnection()); - }; - auto checkOneOutputToTwoInputsConnections = [] - (const 
armnn::IConnectableLayer* const srcLayer, - const armnn::IConnectableLayer* const tgtLayer, - int expectedSrcNumInputs, - int expectedDstNumOutputs = 1) - { - BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); - BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumInputSlots() == 2); - BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); - - BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 2); - for (unsigned int i = 0; i < srcLayer->GetOutputSlot(0).GetNumConnections(); ++i) - { - BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(i) == &tgtLayer->GetInputSlot(i)); - BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(i).GetConnection()); - } - }; - - BOOST_TEST(AreAllLayerInputSlotsConnected(*convLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*fullyConnectedLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*poolingLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*activationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*normalizationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*softmaxLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*batchNormalizationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*additionLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*multiplicationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*outputLayer)); - - // Check connectivity - checkOneOutputToOneInputConnection(inputLayer, convLayer, 0); - checkOneOutputToOneInputConnection(convLayer, fullyConnectedLayer); - checkOneOutputToOneInputConnection(fullyConnectedLayer, poolingLayer); - checkOneOutputToOneInputConnection(poolingLayer, activationLayer); - checkOneOutputToOneInputConnection(activationLayer, normalizationLayer); - checkOneOutputToOneInputConnection(normalizationLayer, softmaxLayer); - checkOneOutputToOneInputConnection(softmaxLayer, batchNormalizationLayer); - checkOneOutputToTwoInputsConnections(batchNormalizationLayer, additionLayer, 1); - 
checkOneOutputToTwoInputsConnections(additionLayer, multiplicationLayer, 2); - checkOneOutputToOneInputConnection(multiplicationLayer, outputLayer, 2, 0); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMerger) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. - armnn::IConnectableLayer* inputLayer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(inputLayer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - - armnn::IConnectableLayer* splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - inputLayer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* softmaxLayer1 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmaxLayer1); - - splitterLayer->GetOutputSlot(0).Connect(softmaxLayer1->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* softmaxLayer2 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmaxLayer2); - - splitterLayer->GetOutputSlot(1).Connect(softmaxLayer2->GetInputSlot(0)); - - // Add a merger layer - armnn::OriginsDescriptor mergerDesc(2, 4); - - armnn::IConnectableLayer* mergerLayer = net.AddMergerLayer(mergerDesc, "merger layer"); - BOOST_TEST(mergerLayer); - - softmaxLayer1->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(0)); - softmaxLayer2->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* outputLayer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(outputLayer); - - mergerLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - - BOOST_TEST(splitterLayer->GetNumOutputSlots() == 2); - BOOST_TEST(splitterLayer->GetOutputSlot(0).GetConnection(0) == &softmaxLayer1->GetInputSlot(0)); - BOOST_TEST(&splitterLayer->GetOutputSlot(0) == softmaxLayer1->GetInputSlot(0).GetConnection()); - 
BOOST_TEST(splitterLayer->GetOutputSlot(1).GetConnection(0) == &softmaxLayer2->GetInputSlot(0)); - BOOST_TEST(&splitterLayer->GetOutputSlot(1) == softmaxLayer2->GetInputSlot(0).GetConnection()); - - BOOST_TEST(mergerLayer->GetNumInputSlots() == 2); - BOOST_TEST(softmaxLayer1->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(0)); - BOOST_TEST(&softmaxLayer1->GetOutputSlot(0) == mergerLayer->GetInputSlot(0).GetConnection()); - BOOST_TEST(softmaxLayer2->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(1)); - BOOST_TEST(&softmaxLayer2->GetOutputSlot(0) == mergerLayer->GetInputSlot(1).GetConnection()); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterAddition) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. - armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(layer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - - armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmax1Layer); - - splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmax2Layer); - - splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); - - // Add addition layer - layer = net.AddAdditionLayer("add layer"); - BOOST_TEST(layer); - - softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* prevLayer = layer; - layer = 
net.AddOutputLayer(0, "output layer"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - - BOOST_TEST(layer); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMultiplication) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. - armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(layer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmax1Layer); - - splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmax2Layer); - - splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); - - // Add multiplication layer - layer = net.AddMultiplicationLayer("multiplication layer"); - BOOST_TEST(layer); - - softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* prevLayer = layer; - layer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(layer); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); -} - -BOOST_AUTO_TEST_CASE(ValidateWorkloads) -{ - const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); - - armnn::Network net; - - armnn::NormalizationDescriptor nmDesc; - armnn::ActivationDescriptor acDesc; - - // in - // | - // nm - // / | - // ac | - // \ | - // ml - // | - // sm - // | - // ot - armnn::IConnectableLayer* layer = net.AddInputLayer(0, 
"in"); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); - - layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); - normLayer->GetOutputSlot(0).SetTensorInfo(desc); - - layer = net.AddActivationLayer(acDesc, "ac"); - - normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - armnn::IConnectableLayer* prevLayer = layer; - layer = net.AddMultiplicationLayer("ml"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - prevLayer = layer; - armnn::SoftmaxDescriptor softmaxDescriptor; - layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - prevLayer = layer; - layer = net.AddOutputLayer(0, "ot"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - - armnn::DeviceSpec spec; - spec.DefaultComputeDevice = armnn::Compute::CpuRef; - - armnn::IOptimizedNetworkPtr optNet = Optimize(net, spec); - static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph().AllocateDynamicBuffers(); - - // validate workloads - armnn::RefWorkloadFactory fact; - for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) - { - BOOST_CHECK_NO_THROW( - layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); - } -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ObservableTest.cpp b/src/armnn/test/ObservableTest.cpp new file mode 100644 index 0000000000..6588f3469e --- /dev/null +++ b/src/armnn/test/ObservableTest.cpp @@ -0,0 +1,94 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include <boost/test/unit_test.hpp> + +#include "Graph.hpp" +#include "Observable.hpp" + +BOOST_AUTO_TEST_SUITE(Observable) + +BOOST_AUTO_TEST_CASE(AddedLayerObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::AddedLayerObservable layerObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + auto input = graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + // Check the observable has observed the changes + std::list<armnn::Layer*> testLayers({ output, input }); + + BOOST_CHECK_EQUAL_COLLECTIONS(layerObservable.begin(), layerObservable.end(), + testLayers.begin(), testLayers.end()); +} + +BOOST_AUTO_TEST_CASE(ClearAddedLayerObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::AddedLayerObservable addedLayerObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + addedLayerObservable.Clear(); + + // Check the observable has observed the changes + std::list<armnn::Layer*> emptyList({}); + + BOOST_CHECK_EQUAL_COLLECTIONS(addedLayerObservable.begin(), addedLayerObservable.end(), + emptyList.begin(), emptyList.end()); +} + +BOOST_AUTO_TEST_CASE(ErasedLayerNamesObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + graph.EraseLayer(output); + + // Check the observable has observed the changes + std::list<std::string> testList({"output"}); + + BOOST_CHECK_EQUAL_COLLECTIONS(erasedLayerNamesObservable.begin(), erasedLayerNamesObservable.end(), + testList.begin(), testList.end()); +} + 
+BOOST_AUTO_TEST_CASE(ClearErasedLayerNamesObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + graph.EraseLayer(output); + + erasedLayerNamesObservable.Clear(); + + // Check the observable has observed the changes + std::list<std::string> emptyList({}); + + BOOST_CHECK_EQUAL_COLLECTIONS(erasedLayerNamesObservable.begin(), erasedLayerNamesObservable.end(), + emptyList.begin(), emptyList.end()); +} + +BOOST_AUTO_TEST_SUITE_END() + diff --git a/src/armnn/test/OpenClTimerTest.cpp b/src/armnn/test/OpenClTimerTest.cpp new file mode 100644 index 0000000000..b8dea8ebe0 --- /dev/null +++ b/src/armnn/test/OpenClTimerTest.cpp @@ -0,0 +1,137 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#if (defined(__aarch64__)) || (defined(__x86_64__)) // disable test failing on FireFly/Armv7 + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "backends/ClContextControl.hpp" +#include "backends/ClWorkloadFactory.hpp" +#include "backends/CpuTensorHandle.hpp" +#include <boost/format.hpp> +#include <iostream> +#include "OpenClTimer.hpp" +#include "backends/test/TensorCopyUtils.hpp" +#include "TensorHelpers.hpp" +#include <boost/test/unit_test.hpp> +#include "backends/WorkloadFactory.hpp" +#include "backends/test/WorkloadTestUtils.hpp" + +using namespace armnn; + +struct OpenClFixture +{ + // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case. + // NOTE: Profiling needs to be enabled in ClContextControl to be able to obtain execution + // times from OpenClTimer. 
+ OpenClFixture() : m_ClContextControl(nullptr, true) {} + ~OpenClFixture() {} + + ClContextControl m_ClContextControl; +}; + +BOOST_FIXTURE_TEST_SUITE(OpenClTimerBatchNorm, OpenClFixture) +using FactoryType = ClWorkloadFactory; + +BOOST_AUTO_TEST_CASE(OpenClTimerBatchNorm) +{ + ClWorkloadFactory workloadFactory; + + const unsigned int width = 2; + const unsigned int height = 3; + const unsigned int channels = 2; + const unsigned int num = 1; + int32_t qOffset = 0; + float qScale = 0.f; + + TensorInfo inputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo outputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo tensorInfo({channels}, GetDataType<float>()); + + // Set quantization parameters if the requested type is a quantized type. + if(IsQuantizedType<float>()) + { + inputTensorInfo.SetQuantizationScale(qScale); + inputTensorInfo.SetQuantizationOffset(qOffset); + outputTensorInfo.SetQuantizationScale(qScale); + outputTensorInfo.SetQuantizationOffset(qOffset); + tensorInfo.SetQuantizationScale(qScale); + tensorInfo.SetQuantizationOffset(qOffset); + } + + auto input = MakeTensor<float, 4>(inputTensorInfo, + QuantizedVector<float>(qScale, qOffset, + { + 1.f, 4.f, + 4.f, 2.f, + 1.f, 6.f, + + 1.f, 1.f, + 4.f, 1.f, + -2.f, 4.f + })); + // these values are per-channel of the input + auto mean = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, -2})); + auto variance = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {4, 9})); + auto beta = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, 2})); + auto gamma = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {2, 1})); + + std::unique_ptr<ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + BatchNormalizationQueueDescriptor data; + 
WorkloadInfo info; + ScopedCpuTensorHandle meanTensor(tensorInfo); + ScopedCpuTensorHandle varianceTensor(tensorInfo); + ScopedCpuTensorHandle betaTensor(tensorInfo); + ScopedCpuTensorHandle gammaTensor(tensorInfo); + + AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]); + AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]); + AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]); + AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]); + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + data.m_Mean = &meanTensor; + data.m_Variance = &varianceTensor; + data.m_Beta = &betaTensor; + data.m_Gamma = &gammaTensor; + data.m_Parameters.m_Eps = 0.0f; + + // for each channel: + // substract mean, divide by standard deviation (with an epsilon to avoid div by 0) + // multiply by gamma and add beta + std::unique_ptr<IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + OpenClTimer openClTimer; + + BOOST_CHECK_EQUAL(openClTimer.GetName(), "OpenClKernelTimer"); + + //Start the timer + openClTimer.Start(); + + //Execute the workload + workload->Execute(); + + //Stop the timer + openClTimer.Stop(); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().size(), 1); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().front().m_Name, + "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]"); + + BOOST_CHECK(openClTimer.GetMeasurements().front().m_Value > 0); + +} + +BOOST_AUTO_TEST_SUITE_END() + +#endif //aarch64 or x86_64
\ No newline at end of file diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index da26fba76e..0c1a2619b2 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -7,6 +7,8 @@ #include "armnn/ArmNN.hpp" #include "Graph.hpp" #include "Optimizer.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "FloatingPointConverter.hpp" namespace { @@ -21,7 +23,7 @@ bool CheckSequence(const armnn::Graph::ConstIterator first, const armnn::Graph:: return (first == last); } -/// Check each unary function in Us evaluates true for each correspondent layer in the sequence [first, last) +/// Checks each unary function in Us evaluates true for each correspondent layer in the sequence [first, last). template <typename U, typename... Us> bool CheckSequence(const armnn::Graph::ConstIterator first, const armnn::Graph::ConstIterator last, @@ -30,11 +32,149 @@ bool CheckSequence(const armnn::Graph::ConstIterator first, { return u(*first) && CheckSequence(std::next(first), last, us...); } + +template <typename LayerT> +bool CheckRelatedLayers(armnn::Graph& graph, const std::list<std::string>& testRelatedLayers) +{ + for (auto& layer : graph) + { + if (layer->GetType() == armnn::LayerEnumOf<LayerT>()) + { + auto& relatedLayers = layer->GetRelatedLayerNames(); + if(!std::equal(relatedLayers.begin(), relatedLayers.end(), + testRelatedLayers.begin(), testRelatedLayers.end())) + { + return false; + } + } + } + + return true; +} + +// connects two layers +using namespace armnn; +void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) +{ + from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); + from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); +} + +void CreateLSTMLayerHelper(Graph &graph, bool CifgEnabled) +{ + LstmDescriptor layerDesc; + layerDesc.m_ActivationFunc = 4; + layerDesc.m_ClippingThresCell = 0.2f; + layerDesc.m_ClippingThresProj = 
0.4f; + layerDesc.m_CifgEnabled = CifgEnabled; + layerDesc.m_PeepholeEnabled = false; + layerDesc.m_ProjectionEnabled = false; + + LstmLayer* const layer = graph.AddLayer<LstmLayer>(layerDesc, "layer"); + unsigned int batchSize = 3; + unsigned int inputSize = 2; + unsigned int numUnits = 4; + unsigned int outputSize = 4; + + layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_CellBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_OutputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + + layer->m_BasicParameters.m_InputToForgetWeights->Allocate(); + layer->m_BasicParameters.m_InputToCellWeights->Allocate(); + layer->m_BasicParameters.m_InputToOutputWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToForgetWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToCellWeights->Allocate(); + 
layer->m_BasicParameters.m_RecurrentToOutputWeights->Allocate(); + layer->m_BasicParameters.m_ForgetGateBias->Allocate(); + layer->m_BasicParameters.m_CellBias->Allocate(); + layer->m_BasicParameters.m_OutputGateBias->Allocate(); + + if (!layerDesc.m_CifgEnabled) + { + layer->m_CifgParameters.m_InputToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_CifgParameters.m_RecurrentToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_CifgParameters.m_CellToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_CifgParameters.m_InputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_CifgParameters.m_InputToInputWeights->Allocate(); + layer->m_CifgParameters.m_RecurrentToInputWeights->Allocate(); + layer->m_CifgParameters.m_CellToInputWeights->Allocate(); + layer->m_CifgParameters.m_InputGateBias->Allocate(); + } + + if (layerDesc.m_ProjectionEnabled) + { + layer->m_ProjectionParameters.m_ProjectionWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ outputSize, numUnits }, DataType::Float32)); + layer->m_ProjectionParameters.m_ProjectionBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ outputSize }, DataType::Float32)); + layer->m_ProjectionParameters.m_ProjectionWeights->Allocate(); + layer->m_ProjectionParameters.m_ProjectionBias->Allocate(); + } + + if (layerDesc.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToForgetWeights->Allocate(); + 
layer->m_PeepholeParameters.m_CellToOutputWeights->Allocate(); + } + + // create input and output layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const outputStateIn = graph.AddLayer<InputLayer>(1, "outputStateIn"); + Layer* const cellStateIn = graph.AddLayer<InputLayer>(2, "cellStateIn"); + Layer* const scratchBuffer = graph.AddLayer<OutputLayer>(0, "scratchBuffer"); + Layer* const outputStateOut = graph.AddLayer<OutputLayer>(1, "outputStateOut"); + Layer* const cellStateOut = graph.AddLayer<OutputLayer>(2, "cellStateOut"); + Layer* const output = graph.AddLayer<OutputLayer>(3, "output"); + + // connect up + armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32); + armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32); + if (layerDesc.m_CifgEnabled) + { + lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 }); + } + + Connect(input, layer, lstmTensorInfo1, 0, 0); + Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1); + Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2); + Connect(layer, scratchBuffer, lstmTensorInfoScratchBuff, 0, 0); + Connect(layer, outputStateOut, lstmTensorInfo3, 1, 0); + Connect(layer, cellStateOut, lstmTensorInfo2, 2, 0); + Connect(layer, output, lstmTensorInfo3, 3, 0); +} + } BOOST_AUTO_TEST_SUITE(Optimizer) +using namespace armnn::optimizations; -BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) +BOOST_AUTO_TEST_CASE(OptimizeInversePermutesTest) { armnn::Graph graph; @@ -42,7 +182,7 @@ BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); - // Insert two permutes, one the inverse of the other + // Inserts two permutes, one the inverse of the other. 
graph.InsertNewLayer<armnn::PermuteLayer>(output->GetInputSlot(0), armnn::PermuteDescriptor({0, 2, 3, 1}), "perm0231"); @@ -57,16 +197,38 @@ BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInversePermutes())); - // The permutes are removed + // The permutes are removed. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, &IsLayerOfType<armnn::OutputLayer>)); } -BOOST_AUTO_TEST_CASE(MovePermuteUp) +BOOST_AUTO_TEST_CASE(LSTMValidateTensorShapesFromInputsCIFGDisabledTest) +{ + Graph graph; + + //Helper function creates graph containing LSTM layer with required input and output layers + CreateLSTMLayerHelper(graph, false); + + //This function used to call ValidateShapesFromInputs(); + BOOST_CHECK_NO_THROW(graph.InferTensorInfos()); +} + +BOOST_AUTO_TEST_CASE(LSTMValidateTensorShapesFromInputsCIFGEnabledTest) +{ + Graph graph; + + //Helper function creates graph containing LSTM layer with required input and output layers + CreateLSTMLayerHelper(graph, true); + + //This function used to call ValidateShapesFromInputs(); + BOOST_CHECK_NO_THROW(graph.InferTensorInfos()); +} + +BOOST_AUTO_TEST_CASE(MovePermuteUpTest) { const armnn::TensorInfo info({ 1, 5, 2, 3 }, armnn::DataType::Float32); const armnn::TensorInfo permuted({ 1, 3, 5, 2 }, armnn::DataType::Float32); @@ -77,12 +239,16 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) armnn::Layer* head = graph.AddLayer<armnn::OutputLayer>(0, "output"); + std::string permuteLayerName = "original_permute"; + // Insert permute head = graph.InsertNewLayer<armnn::PermuteLayer>(head->GetInputSlot(0), - armnn::PermuteDescriptor({ 0, 2, 3, 1 }), ""); + armnn::PermuteDescriptor({ 0, 2, 3, 1 }), + permuteLayerName.c_str()); + head->GetOutputHandler().SetTensorInfo(permuted); - // Insert layers that don't care about data format + // Inserts 
layers that don't care about data format. head = graph.InsertNewLayer<armnn::ActivationLayer>(head->GetInputSlot(0), armnn::ActivationDescriptor{}, ""); head->GetOutputHandler().SetTensorInfo(info); @@ -90,7 +256,7 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) head = graph.InsertNewLayer<armnn::AdditionLayer>(head->GetInputSlot(0), ""); head->GetOutputHandler().SetTensorInfo(info); - // Insert input for 2nd input of Addition + // Inserts input for 2nd input of Addition. graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); @@ -107,11 +273,11 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) head = graph.InsertNewLayer<armnn::MultiplicationLayer>(head->GetInputSlot(0), ""); head->GetOutputHandler().SetTensorInfo(info); - // Insert input for 2nd input of Multiplication + // Inserts input for 2nd input of Multiplication. graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); - // Insert input + // Inserts input. graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(0), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); @@ -129,9 +295,9 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(MovePermuteUp())); - // The permute is moved to the top. New permutes for layers with multiple inputs + // The permute is moved to the top. New permutes for layers with multiple inputs. 
BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, @@ -147,12 +313,18 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) &IsLayerOfType<armnn::AdditionLayer>, &IsLayerOfType<armnn::ActivationLayer>, &IsLayerOfType<armnn::OutputLayer>)); + + std::list<std::string> testRelatedLayers = { permuteLayerName }; + + BOOST_TEST(CheckRelatedLayers<armnn::PermuteLayer>(graph, testRelatedLayers)); } -BOOST_AUTO_TEST_CASE(PermuteAsReshape) +BOOST_AUTO_TEST_CASE(PermuteAsReshapeTest) { armnn::Graph graph; + std::string permuteLayerName = "permute"; + const armnn::TensorInfo infoIn({ 1, 2, 3, 1 }, armnn::DataType::Float32); const armnn::TensorInfo infoOut({ 1, 1, 2, 3 }, armnn::DataType::Float32); @@ -161,9 +333,9 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input") ->GetOutputHandler().SetTensorInfo(infoIn); - // Insert permute + // Inserts permute. graph.InsertNewLayer<armnn::PermuteLayer>(output->GetInputSlot(0), - armnn::PermuteDescriptor({ 0, 2, 3, 1 }), "") + armnn::PermuteDescriptor({ 0, 2, 3, 1 }), permuteLayerName.c_str()) ->GetOutputHandler().SetTensorInfo(infoOut); BOOST_TEST(CheckSequence(graph.cbegin(), @@ -172,7 +344,7 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(PermuteAsReshape())); // The permute is replaced by an equivalent reshape. 
@@ -189,9 +361,13 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) &IsLayerOfType<armnn::InputLayer>, checkReshape, &IsLayerOfType<armnn::OutputLayer>)); + + + std::list<std::string> testRelatedLayers = { permuteLayerName }; + BOOST_TEST(CheckRelatedLayers<armnn::ReshapeLayer>(graph, testRelatedLayers)); } -BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) +BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapesTest) { armnn::Graph graph; @@ -203,16 +379,19 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) input->GetOutputHandler().SetTensorInfo(info0); { - // Insert two reshapes + // Inserts two reshapes. const armnn::TensorInfo info1({1, 30, 1, 1}, armnn::DataType::Float32); const armnn::TensorInfo info2({1, 2, 1, 15}, armnn::DataType::Float32); + std::string reshape1Name = "reshape1"; + std::string reshape2Name = "reshape2"; + auto reshape1 = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info1.GetShape() }, - "reshape1"); + reshape1Name.c_str()); auto reshape2 = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info2.GetShape() }, - "reshape2"); + reshape2Name.c_str()); reshape1->GetOutputHandler().SetTensorInfo(info1); reshape2->GetOutputHandler().SetTensorInfo(info2); @@ -224,7 +403,7 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) &IsLayerOfType<armnn::ReshapeLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeConsecutiveReshapes())); auto checkReshape = [&info2](const armnn::Layer* const layer) -> bool { @@ -234,25 +413,30 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) (reshapeLayer->GetOutputHandler().GetTensorInfo().GetShape() == info2.GetShape()); }; - // The two reshapes are replaced by a single equivalent reshape + // The two reshapes are replaced by a single equivalent reshape. 
BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, checkReshape, &IsLayerOfType<armnn::OutputLayer>)); + + // Check the new reshape layer has the other two reshapes as related layers + std::list<std::string> testRelatedLayers = { reshape2Name, reshape1Name }; + + BOOST_TEST(CheckRelatedLayers<armnn::ReshapeLayer>(graph, testRelatedLayers)); } { - // Insert a reshape to the input shape + // Inserts a reshape to the input shape. auto reshapeToIn = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info0.GetShape() }, "reshapeToIn"); reshapeToIn->GetOutputHandler().SetTensorInfo(info0); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeConsecutiveReshapes())); - // The two reshapes are removed + // The two reshapes are removed. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, @@ -260,7 +444,7 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) } } -BOOST_AUTO_TEST_CASE(SquashEqualSiblings) +BOOST_AUTO_TEST_CASE(SquashEqualSiblingsTest) { armnn::Graph graph; @@ -272,7 +456,7 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); input->GetOutputSlot().SetTensorInfo(info); - // Insert equal permutes, equal reshapes and something else + // Inserts equal permutes, equal reshapes and something else. const armnn::PermuteDescriptor permDesc({ 0, 2, 3, 1 }); const armnn::ReshapeDescriptor reshapeDesc{ { 1, 3, 1, 5 } }; @@ -314,7 +498,8 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) &IsLayerOfType<armnn::OutputLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(SquashEqualPermuteSiblings(), + SquashEqualReshapeSiblings())); // The permutes and reshapes are squashed. 
@@ -331,4 +516,259 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) &IsLayerOfType<armnn::OutputLayer>)); } +BOOST_AUTO_TEST_CASE(ConvertConstantsHalfToFloatTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo info({ 1,1,1,2 }, armnn::DataType::Float32); + + // Create the half precision input data + unsigned int dims[] = { 4,1,1,1 }; + std::vector<float> convWeightsData{1.f, 2.f, 3.f, 4.f}; + std::vector<uint16_t> halfWeights(4); + armnnUtils::FloatingPointConverter::ConvertFloat32To16(convWeightsData.data(), + convWeightsData.size(), + halfWeights.data()); + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float16), halfWeights); + + //Create the simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(info); + + auto fc = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc"); + fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights); + fc->GetOutputSlot().SetTensorInfo(info); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + //Connect up the layers + input->GetOutputSlot().Connect(fc->GetInputSlot(0)); + fc->GetOutputSlot().Connect(output->GetInputSlot(0)); + + //Test the tensor info is correct. + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float16); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsHalfToFloat())); + + //Test the tensor info is correct. 
+ BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32); + + // Now test the data matches float32 data + float* data = fc->m_Weight->GetTensor<float>(); + BOOST_CHECK(1.0f == data[0]); + BOOST_CHECK(2.0f == data[1]); + BOOST_CHECK(3.0f == data[2]); + BOOST_CHECK(4.0f == data[3]); +} + +BOOST_AUTO_TEST_CASE(ConvertConstantsFloatToHalfTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo info({ 1, 1, 1, 2 }, armnn::DataType::Float16); + + // Create const tensor from fp32 data + unsigned int dims[] = { 4, 1, 1, 1 }; + std::vector<float> floatWeights{ 1.0f, 2.0f, 3.0f, 4.0f }; + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), floatWeights); + + // Create simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(info); + + auto fc = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc"); + fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights); + fc->GetOutputSlot().SetTensorInfo(info); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + // Connect up the layers + input->GetOutputSlot().Connect(fc->GetInputSlot(0)); + fc->GetOutputSlot().Connect(output->GetInputSlot(0)); + + // Check tensor data type before conversion + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsFloatToHalf())); + + // Check tensor data type after conversion + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float16); + + // Check whether data matches expected fp16 data + Half* data = fc->m_Weight->GetTensor<Half>(); + BOOST_CHECK(data[0] == Half(1.0f)); + BOOST_CHECK(data[1] == Half(2.0f)); + BOOST_CHECK(data[2] == Half(3.0f)); + BOOST_CHECK(data[3] == Half(4.0f)); +} + +BOOST_AUTO_TEST_CASE(OptimizeInverseConversionsTest) +{ + armnn::Graph 
graph; + + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + // Fp32ToFp16 conversion followed by an inverse Fp16ToFp32 conversion + graph.InsertNewLayer<armnn::ConvertFp32ToFp16Layer>(output->GetInputSlot(0), "convert1"); + graph.InsertNewLayer<armnn::ConvertFp16ToFp32Layer>(output->GetInputSlot(0), "convert2"); + + graph.InsertNewLayer<armnn::Convolution2dLayer>(output->GetInputSlot(0), Convolution2dDescriptor(), "conv"); + + // Fp16ToFp32 conversion followed by an inverse Fp32ToFp16 conversion + graph.InsertNewLayer<armnn::ConvertFp16ToFp32Layer>(output->GetInputSlot(0), "convert3"); + graph.InsertNewLayer<armnn::ConvertFp32ToFp16Layer>(output->GetInputSlot(0), "convert4"); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::Convolution2dLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::OutputLayer>)); + + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInverseConversionsFp16(), + OptimizeInverseConversionsFp32())); + + // Check that all consecutive inverse conversions are removed + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::Convolution2dLayer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + +BOOST_AUTO_TEST_CASE(InsertConvertersTest) +{ + const armnn::TensorInfo info({ 1, 5, 2, 3 }, armnn::DataType::Float16); + + armnn::Graph graph; + + armnn::LayerBindingId inputId = 0; + + armnn::Layer* head = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + head = graph.InsertNewLayer<armnn::AdditionLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + 
graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") + ->GetOutputHandler().SetTensorInfo(info); + + head = graph.InsertNewLayer<armnn::FloorLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + head = graph.InsertNewLayer<armnn::MemCopyLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(0), inputId++, "") + ->GetOutputHandler().SetTensorInfo(info); + + // Check graph layer sequence before inserting convert layers + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::MemCopyLayer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::AdditionLayer>, + &IsLayerOfType<armnn::OutputLayer>)); + + // Check layers have Float16 DataType + for (auto& layer : graph) + { + if(layer->GetType()==LayerType::Floor || layer->GetType() == LayerType::Addition) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float16); + BOOST_ASSERT(layer->GetDataType() == DataType::Float16); + } + } + + // Insert convert layers either side of unsupported layer + for (auto& layer : graph) + { + if(layer->GetType()==LayerType::Floor || layer->GetType() == LayerType::Addition) + { + InsertConvertFp16ToFp32LayersBefore(graph, *layer); + InsertConvertFp32ToFp16LayersAfter(graph, *layer); + } + } + + // Check layers have correct DataType after inserting convert layers + for (auto& layer : graph) + { + if (layer->GetType()==LayerType::Floor || layer->GetType() == LayerType::Addition) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float32); + BOOST_ASSERT(layer->GetDataType() == DataType::Float32); + } + else if (layer->GetType() == LayerType::ConvertFp16ToFp32) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float32); + 
BOOST_ASSERT(layer->GetDataType() == DataType::Float16); + } + else if (layer->GetType() == LayerType::ConvertFp32ToFp16) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float16); + BOOST_ASSERT(layer->GetDataType() == DataType::Float32); + } + } + + // Check sequence of layers after inserting convert layers + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::MemCopyLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::AdditionLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + +BOOST_AUTO_TEST_CASE(Fp32NetworkToFp16OptimizationTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo infoFP32({ 2,2,1,3 }, armnn::DataType::Float32); + + // Create the simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(infoFP32); + + auto floor = graph.AddLayer<armnn::FloorLayer>("floor"); + floor->GetOutputSlot().SetTensorInfo(infoFP32); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + // Connect up the layers + input->GetOutputSlot().Connect(floor->GetInputSlot(0)); + floor->GetOutputSlot().Connect(output->GetInputSlot(0)); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::OutputLayer>)); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(Fp32NetworkToFp16Converter())); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::FloorLayer>, + 
&IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ProfilerTests.cpp b/src/armnn/test/ProfilerTests.cpp new file mode 100644 index 0000000000..4450c5a08e --- /dev/null +++ b/src/armnn/test/ProfilerTests.cpp @@ -0,0 +1,235 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include <boost/test/unit_test.hpp> +#include <boost/test/output_test_stream.hpp> +#include <boost/algorithm/string.hpp> + +#include <memory> +#include <thread> + +#include <armnn/TypesUtils.hpp> +#include <Profiling.hpp> + +namespace armnn +{ + +size_t GetProfilerEventSequenceSize(armnn::Profiler* profiler) +{ + if (!profiler) + { + return static_cast<size_t>(-1); + } + + return profiler->m_EventSequence.size(); +} +} // namespace armnn + +namespace +{ + +void RegisterUnregisterProfilerSingleThreadImpl() +{ + // Important! Regular assertions must be used in this function for testing (rather than + // BOOST_TEST macros) otherwise multi-threading tests would randomly fail. + + // Get a reference to the profiler manager. + armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Check that there's no profiler registered for this thread. + assert(!profilerManager.GetProfiler()); + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profilerManager.RegisterProfiler(profiler.get()); + + // Check that on a single thread we get the same profiler we registered. + assert(profiler.get() == profilerManager.GetProfiler()); + + // Destroy the profiler. + profiler.reset(); + + // Check that the profiler has been un-registered for this thread. 
+ assert(!profilerManager.GetProfiler()); +} + +} // namespace + +BOOST_AUTO_TEST_SUITE(Profiler) + +BOOST_AUTO_TEST_CASE(EnableDisableProfiling) +{ + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + + // Check that profiling is disabled by default. + BOOST_TEST(!profiler->IsProfilingEnabled()); + + // Enable profiling. + profiler->EnableProfiling(true); + + // Check that profiling is enabled. + BOOST_TEST(profiler->IsProfilingEnabled()); + + // Disable profiling. + profiler->EnableProfiling(false); + + // Check that profiling is disabled. + BOOST_TEST(!profiler->IsProfilingEnabled()); +} + +BOOST_AUTO_TEST_CASE(RegisterUnregisterProfilerSingleThread) +{ + RegisterUnregisterProfilerSingleThreadImpl(); +} + +BOOST_AUTO_TEST_CASE(RegisterUnregisterProfilerMultipleThreads) +{ + std::thread thread1([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + std::thread thread2([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + std::thread thread3([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + + thread1.join(); + thread2.join(); + thread3.join(); +} + +BOOST_AUTO_TEST_CASE(ProfilingMacros) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + { // --- No profiler --- + + // Check that there's no profiler registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + + // Test scoped event. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that we still cannot get a profiler for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + } + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profilerManager.RegisterProfiler(profiler.get()); + + { // --- Profiler, but profiling disabled --- + + // Get current event sequence size. 
+ size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that no profiling event has been added to the sequence. + size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeBefore == eventSequenceSizeAfter); + } + + // Enable profiling. + profiler->EnableProfiling(true); + + { // --- Profiler, and profiling enabled --- + + // Get current event sequence size. + size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that a profiling event has been added to the sequence. + size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeAfter == eventSequenceSizeBefore + 1); + } + + // Disable profiling here to not print out anything on stdout. + profiler->EnableProfiling(false); +} + +BOOST_AUTO_TEST_CASE(RuntimeLoadNetwork) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Check that there's no profiler registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + + // Build a mock-network and load it into the runtime. + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + armnn::NetworkId networkIdentifier = 1; + armnn::INetworkPtr mockNetwork(armnn::INetwork::Create()); + mockNetwork->AddInputLayer(0, "test layer"); + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + runtime->LoadNetwork(networkIdentifier, armnn::Optimize(*mockNetwork, backends, runtime->GetDeviceSpec())); + + // Check that now there's a profiler registered for this thread (created and registered by the loading the network). 
+ BOOST_TEST(profilerManager.GetProfiler()); + + // Unload the network. + runtime->UnloadNetwork(networkIdentifier); + + // Check that the profiler has been un-registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); +} + +BOOST_AUTO_TEST_CASE(WriteEventResults) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profileManager.RegisterProfiler(profiler.get()); + + // Enable profiling. + profiler->EnableProfiling(true); + + { // --- Profiler, and profiling enabled --- + + // Get current event sequence size. + size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { + // Need to directly create a ScopedProfilingEvent as the one created by the macro falls out of scope + // immediately causing the Event.Stop() function method to be called immediately after the Event.Start() + // function resulting in periodic test failures on the Dent and Smith HiKeys + armnn::ScopedProfilingEvent testEvent(armnn::Compute::CpuAcc, "test", armnn::WallClockTimer()); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // Check that a profiling event has been added to the sequence. 
+ size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeAfter == eventSequenceSizeBefore + 1); + + boost::test_tools::output_test_stream output; + profiler->AnalyzeEventsAndWriteResults(output); + BOOST_TEST(!output.is_empty(false)); + + // output should contain event name 'test' + BOOST_CHECK(boost::contains(output.str(), "test")); + + // output should contain headers + BOOST_CHECK(boost::contains(output.str(), "Event Sequence - Name")); + BOOST_CHECK(boost::contains(output.str(), "Event Stats - Name")); + BOOST_CHECK(boost::contains(output.str(), "Total")); + BOOST_CHECK(boost::contains(output.str(), "Device")); + // output should contain compute device 'CpuAcc' + BOOST_CHECK(boost::contains(output.str(), "CpuAcc")); + // output should not contain un-readable numbers + BOOST_CHECK(!(boost::contains(output.str(), "e+"))); + // output should not contain un-readable numbers + BOOST_CHECK(!(boost::contains(output.str(), "+"))); + // output should not contain zero value + BOOST_CHECK(!(boost::contains(output.str(), " 0 "))); + } + + // Disable profiling here to not print out anything on stdout. + profiler->EnableProfiling(false); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ProfilingEventTest.cpp b/src/armnn/test/ProfilingEventTest.cpp new file mode 100644 index 0000000000..4d0319d456 --- /dev/null +++ b/src/armnn/test/ProfilingEventTest.cpp @@ -0,0 +1,95 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include <boost/test/unit_test.hpp> + +#include "ProfilingEvent.hpp" +#include "Profiling.hpp" +#include <thread> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(ProfilingEvent) + +BOOST_AUTO_TEST_CASE(ProfilingEventTest) +{ + // Get a reference to the profiler manager. 
+ armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + const char* eventName = "EventName"; + + Event::Instruments insts1; + insts1.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent(eventName, + nullptr, + nullptr, + armnn::Compute::Undefined, + std::move(insts1)); + + BOOST_CHECK_EQUAL(testEvent.GetName(), "EventName"); + + // start the timer - outer + testEvent.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer - outer + testEvent.Stop(); + + BOOST_CHECK_GE(testEvent.GetMeasurements().front().m_Value, 10.0); + + // create a sub event with CpuAcc + Event::Instruments insts2; + insts2.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent2(eventName, + profileManager.GetProfiler(), + &testEvent, + Compute::CpuAcc, + std::move(insts2)); + + BOOST_CHECK_EQUAL(&testEvent, testEvent2.GetParentEvent()); + BOOST_CHECK_EQUAL(profileManager.GetProfiler(), testEvent2.GetProfiler()); + BOOST_CHECK_EQUAL(Compute::CpuAcc, testEvent2.GetComputeDevice()); +} + +BOOST_AUTO_TEST_CASE(ProfilingEventTestOnGpuAcc) +{ + // Get a reference to the profiler manager. 
+ armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + const char* eventName = "GPUEvent"; + + Event::Instruments insts1; + insts1.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent(eventName, + nullptr, + nullptr, + armnn::Compute::Undefined, + std::move(insts1)); + + BOOST_CHECK_EQUAL(testEvent.GetName(), "GPUEvent"); + + // start the timer - outer + testEvent.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer - outer + testEvent.Stop(); + + BOOST_CHECK_GE(testEvent.GetMeasurements().front().m_Value, 10.0); + + // create a sub event + Event::Instruments insts2; + insts2.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent2(eventName, profileManager.GetProfiler(), &testEvent, Compute::GpuAcc, std::move(insts2)); + + BOOST_CHECK_EQUAL(&testEvent, testEvent2.GetParentEvent()); + BOOST_CHECK_EQUAL(profileManager.GetProfiler(), testEvent2.GetProfiler()); + BOOST_CHECK_EQUAL(Compute::GpuAcc, testEvent2.GetComputeDevice()); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/RuntimeTests.cpp b/src/armnn/test/RuntimeTests.cpp index fcb0a1e7c2..e29a1d4841 100644 --- a/src/armnn/test/RuntimeTests.cpp +++ b/src/armnn/test/RuntimeTests.cpp @@ -32,33 +32,46 @@ BOOST_AUTO_TEST_SUITE(Runtime) BOOST_AUTO_TEST_CASE(RuntimeUnloadNetwork) { // build 2 mock-networks and load them into the runtime - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // mock network 1 + // Mock network 1. 
armnn::NetworkId networkIdentifier1 = 1; armnn::INetworkPtr mockNetwork1(armnn::INetwork::Create()); mockNetwork1->AddInputLayer(0, "test layer"); - runtime->LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, runtime->GetDeviceSpec())); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + runtime->LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, backends, runtime->GetDeviceSpec())); - // mock network 2 + // Mock network 2. armnn::NetworkId networkIdentifier2 = 2; armnn::INetworkPtr mockNetwork2(armnn::INetwork::Create()); mockNetwork2->AddInputLayer(0, "test layer"); - runtime->LoadNetwork(networkIdentifier2, Optimize(*mockNetwork2, runtime->GetDeviceSpec())); + runtime->LoadNetwork(networkIdentifier2, Optimize(*mockNetwork2, backends, runtime->GetDeviceSpec())); - // unload one by its networkID + // Unloads one by its networkID. BOOST_TEST(runtime->UnloadNetwork(networkIdentifier1) == armnn::Status::Success); BOOST_TEST(runtime->UnloadNetwork(networkIdentifier1) == armnn::Status::Failure); } // Note: the current builds we don't do valgrind and gperftools based leak checking at the same -// time, so in practice WITH_VALGRIND and ARMNN_LEAK_CHECKING_ENABLED are exclusive. In -// the future the gperftools based leak checking should stay and the valgrind based should -// be removed. +// time, so in practice WITH_VALGRIND and ARMNN_LEAK_CHECKING_ENABLED are exclusive. The +// valgrind tests can stay for x86 builds, but on hikey Valgrind is just way too slow +// to be integrated into the CI system. 
-#if ARMNN_LEAK_CHECKING_ENABLED -void CreateAndDropDummyNetwork(armnn::Runtime & runtime) +#ifdef ARMNN_LEAK_CHECKING_ENABLED + +struct DisableGlobalLeakChecking +{ + DisableGlobalLeakChecking() + { + ARMNN_LOCAL_LEAK_CHECKING_ONLY(); + } +}; + +BOOST_GLOBAL_FIXTURE(DisableGlobalLeakChecking); + +void CreateAndDropDummyNetwork(const std::vector<armnn::Compute>& backends, armnn::Runtime& runtime) { armnn::NetworkId networkIdentifier; { @@ -74,12 +87,12 @@ void CreateAndDropDummyNetwork(armnn::Runtime & runtime) input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - armnn::IOptimizedNetworkPtr optNet = Optimize(*network, runtime.GetDeviceSpec()); + armnn::IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime.GetDeviceSpec()); runtime.LoadNetwork(networkIdentifier, std::move(optNet)); } @@ -94,10 +107,13 @@ BOOST_AUTO_TEST_CASE(RuntimeHeapMemoryUsageSanityChecks) ARMNN_SCOPED_LEAK_CHECKER("Sanity_Check_Outer"); { ARMNN_SCOPED_LEAK_CHECKER("Sanity_Check_Inner"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE() == true); std::unique_ptr<char[]> dummyAllocation(new char[1000]); - BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE() == false); - BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() >= 1000); - BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() >= 1); + BOOST_CHECK_MESSAGE(ARMNN_NO_LEAKS_IN_SCOPE() == false, + "A leak of 1000 bytes is expected here. 
" + "Please make sure environment variable: HEAPCHECK=draconian is set!"); + BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 1000); + BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 1); } BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); @@ -109,22 +125,24 @@ BOOST_AUTO_TEST_CASE(RuntimeHeapMemoryUsageSanityChecks) BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksGpuAcc) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - - armnn::Runtime runtime(armnn::Compute::GpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkGpuAcc"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -136,22 +154,24 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksGpuAcc) BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksCpuAcc) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - - armnn::Runtime runtime(armnn::Compute::CpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuAcc}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. 
- CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkCpuAcc"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -163,21 +183,24 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksCpuRef) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - armnn::Runtime runtime(armnn::Compute::CpuRef); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkCpuRef"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -199,25 +222,28 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) // A start-pointer or chain of start-pointers to the block is found. Since the block is still pointed at, // the programmer could, at least in principle, have freed it before program exit. 
- // We want to test this in case memory is not freed as early as it could have been + // We want to test this in case memory is not freed as early as it could have been. unsigned long reachableBefore = 0; unsigned long reachableAfter = 0; - // needed as out params but we don't test them + // Needed as out params but we don't test them. unsigned long dubious = 0; unsigned long suppressed = 0; - // ensure that runtime is large enough before checking for memory leaks - // otherwise when loading the network it will automatically reserve memory that won't be released until destruction + // Ensure that runtime is large enough before checking for memory leaks. + // Otherwise, when loading the network, it will automatically reserve memory that won't be released + // until destruction. armnn::NetworkId networkIdentifier; - armnn::Runtime runtime(armnn::Compute::GpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); - // check for leaks before we load the network and record them so that we can see the delta after unloading + // Checks for leaks before we load the network and record them so that we can see the delta after unloading. VALGRIND_DO_QUICK_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedBefore, dubious, reachableBefore, suppressed); // build a mock-network and load it into the runtime + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; { armnn::TensorInfo inputTensorInfo(armnn::TensorShape({ 7, 7 }), armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo(armnn::TensorShape({ 7, 7 }), armnn::DataType::Float32); @@ -231,12 +257,12 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. 
input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - armnn::IOptimizedNetworkPtr optNet = Optimize(*mockNetwork, runtime.GetDeviceSpec()); + armnn::IOptimizedNetworkPtr optNet = Optimize(*mockNetwork, backends, runtime.GetDeviceSpec()); runtime.LoadNetwork(networkIdentifier, std::move(optNet)); } @@ -246,16 +272,16 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) VALGRIND_DO_ADDED_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedAfter, dubious, reachableAfter, suppressed); - // if we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass + // If we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass. BOOST_TEST(leakedBefore == leakedAfter); // Add resonable threshold after and before running valgrind with the ACL clear cache function. // TODO Threshold set to 80k until the root cause of the memory leakage is found and fixed. Revert threshold - // value to 1024 when fixed + // value to 1024 when fixed. BOOST_TEST(static_cast<long>(reachableAfter) - static_cast<long>(reachableBefore) < 81920); - // these are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters - // so they are assigned to, but still considered unused, causing a warning + // These are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters + // so they are assigned to, but still considered unused, causing a warning. boost::ignore_unused(dubious); boost::ignore_unused(suppressed); } @@ -263,7 +289,7 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) // Note: this part of the code is due to be removed when we fully trust the gperftools based results. 
#ifdef WITH_VALGRIND -// run with the following command to get all the amazing output (in the devenv/build folder) :) +// Run with the following command to get all the amazing output (in the devenv/build folder) :) // valgrind --leak-check=full --show-leak-kinds=all --log-file=Valgrind_Memcheck_Leak_Report.txt armnn/test/UnitTests BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) { @@ -276,11 +302,11 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) // A start-pointer or chain of start-pointers to the block is found. Since the block is still pointed at, // the programmer could, at least in principle, have freed it before program exit. - // We want to test this in case memory is not freed as early as it could have been + // We want to test this in case memory is not freed as early as it could have been. unsigned long reachableBefore = 0; unsigned long reachableAfter = 0; - // needed as out params but we don't test them + // Needed as out params but we don't test them. unsigned long dubious = 0; unsigned long suppressed = 0; @@ -288,14 +314,15 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) // ensure that runtime is large enough before checking for memory leaks // otherwise when loading the network it will automatically reserve memory that won't be released until destruction - armnn::Runtime runtime(armnn::Compute::CpuRef); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); - // check for leaks before we load the network and record them so that we can see the delta after unloading + // Checks for leaks before we load the network and record them so that we can see the delta after unloading. VALGRIND_DO_QUICK_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedBefore, dubious, reachableBefore, suppressed); - // build a mock-network and load it into the runtime + // Builds a mock-network and load it into the runtime. 
{ unsigned int inputShape[] = {1, 7, 1, 1}; armnn::TensorInfo inputTensorInfo(4, inputShape, armnn::DataType::Float32); @@ -303,10 +330,9 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) std::unique_ptr<armnn::Network> mockNetwork1 = std::make_unique<armnn::Network>(); mockNetwork1->AddInputLayer(0, "test layer"); - armnn::DeviceSpec device; - device.DefaultComputeDevice = armnn::Compute::CpuRef; - runtime.LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, device)); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + runtime.LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, backends, runtime.GetDeviceSpec())); } runtime.UnloadNetwork(networkIdentifier1); @@ -314,7 +340,7 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) VALGRIND_DO_ADDED_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedAfter, dubious, reachableAfter, suppressed); - // if we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass + // If we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass. BOOST_TEST(leakedBefore == leakedAfter); #if defined(ARMCOMPUTECL_ENABLED) @@ -329,11 +355,134 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) BOOST_TEST(reachableBefore >= reachableAfter); - // these are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters - // so they are assigned to, but still considered unused, causing a warning + // These are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters + // so they are assigned to, but still considered unused, causing a warning. 
boost::ignore_unused(dubious); boost::ignore_unused(suppressed); } #endif +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(RuntimeValidateCpuAccDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // Load it into the runtime. It should success. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == armnn::Status::Success); +} +#endif // ARMCOMPUTENEON_ENABLED + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(RuntimeValidateGpuDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::GpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // Load it into the runtime. It should success. 
+ armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == armnn::Status::Success); +} +#endif // ARMCOMPUTECL_ENABLED + +BOOST_AUTO_TEST_CASE(RuntimeCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc, should be fall back to CpuRef. + NormalizationDescriptor descriptor; + IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // optimize the network + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should success. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} + +BOOST_AUTO_TEST_CASE(RuntimeFallbackToCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc, should be fall back to CpuRef. 
+ NormalizationDescriptor descriptor; + IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // Allow fallback to CpuRef. + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef }; + // optimize the network + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should succeed. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/TensorHelpers.hpp b/src/armnn/test/TensorHelpers.hpp index aac4c1d15e..ec38940a44 100644 --- a/src/armnn/test/TensorHelpers.hpp +++ b/src/armnn/test/TensorHelpers.hpp @@ -39,7 +39,7 @@ struct SelectiveComparer<T, false> { static bool Compare(T a, T b) { - // if a or b is zero, percent_tolerance does an exact match, so compare to a small, constant tolerance instead + // If a or b is zero, percent_tolerance does an exact match, so compare to a small, constant tolerance instead. if (a == 0.0f || b == 0.0f) { return std::abs(a - b) <= g_FloatCloseToZeroTolerance; @@ -62,7 +62,7 @@ template <typename T, std::size_t n> boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n>& a, const boost::multi_array<T, n>& b) { - // check they are same shape + // Checks they are same shape. for (unsigned int i=0; i<n; i++) { if (a.shape()[i] != b.shape()[i]) @@ -77,9 +77,9 @@ boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n } } - // now compare element-wise + // Now compares element-wise. 
- // fun iteration over n dimensions + // Fun iteration over n dimensions. std::array<unsigned int, n> indices; for (unsigned int i = 0; i < n; i++) { @@ -150,7 +150,7 @@ boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n } -// Creates a boost::multi_array with shape defined by the given TensorInfo. +// Creates a boost::multi_array with the shape defined by the given TensorInfo. template <typename T, std::size_t n> boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo) { @@ -164,7 +164,7 @@ boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo) return boost::multi_array<T, n>(shape); } -// Creates a boost::multi_array with shape defined by the given TensorInfo and contents defined by the given vector. +// Creates a boost::multi_array with the shape defined by the given TensorInfo and contents defined by the given vector. template <typename T, std::size_t n> boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo, const std::vector<T>& flat) { diff --git a/src/armnn/test/TensorTest.cpp b/src/armnn/test/TensorTest.cpp index 2bb37f4fb8..8057d4dd7a 100644 --- a/src/armnn/test/TensorTest.cpp +++ b/src/armnn/test/TensorTest.cpp @@ -8,7 +8,7 @@ namespace armnn { -// Add unit test framework for interpreting TensorInfo type +// Adds unit test framework for interpreting TensorInfo type. std::ostream& boost_test_print_type(std::ostream& ostr, const TensorInfo& right) { ostr << "TensorInfo[ " @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(TensorVsConstTensor) armnn::Tensor t(TensorInfo(), &mutableDatum); armnn::ConstTensor ct(TensorInfo(), &immutableDatum); - // Check that both Tensor and ConstTensor can be passed as a ConstTensor + // Checks that both Tensor and ConstTensor can be passed as a ConstTensor. 
CheckTensor(t); CheckTensor(ct); } @@ -136,9 +136,9 @@ BOOST_AUTO_TEST_CASE(ModifyTensorInfo) BOOST_AUTO_TEST_CASE(TensorShapeOperatorBrackets) { TensorShape shape({0,1,2,3}); - // Check version of operator[] which returns an unsigned int + // Checks version of operator[] which returns an unsigned int. BOOST_TEST(shape[2] == 2); - // Check the version of operator[] which returns a reference + // Checks the version of operator[] which returns a reference. shape[2] = 20; BOOST_TEST(shape[2] == 20); } diff --git a/src/armnn/test/UnitTests.cpp b/src/armnn/test/UnitTests.cpp index 0e2f99583f..203fbfe821 100644 --- a/src/armnn/test/UnitTests.cpp +++ b/src/armnn/test/UnitTests.cpp @@ -44,7 +44,7 @@ class SetupDebugOutput public: SetupDebugOutput() { - // Send the output to both cout (as standard) and the debug output. + // Sends the output to both cout (as standard) and the debug output. m_OutputStream.push(tee(std::cout)); m_OutputStream.push(m_DebugOutputSink); diff --git a/src/armnn/test/UnitTests.hpp b/src/armnn/test/UnitTests.hpp index 9b750b5b33..8d5c7055e7 100644 --- a/src/armnn/test/UnitTests.hpp +++ b/src/armnn/test/UnitTests.hpp @@ -12,7 +12,7 @@ inline void ConfigureLoggingTest() { - // Configure logging for both the ARMNN library and this test program + // Configures logging for both the ARMNN library and this test program. 
armnn::ConfigureLogging(true, true, armnn::LogSeverity::Fatal); armnnUtils::ConfigureLogging(boost::log::core::get().get(), true, true, armnn::LogSeverity::Fatal); } @@ -43,9 +43,27 @@ void CompareTestResultIfSupported(const std::string& testName, const LayerTestRe } } +template <typename T, std::size_t n> +void CompareTestResultIfSupported(const std::string& testName, const std::vector<LayerTestResult<T, n>>& testResult) +{ + bool testNameIndicatesUnsupported = testName.find("UNSUPPORTED") != std::string::npos; + for (unsigned int i = 0; i < testResult.size(); ++i) + { + BOOST_CHECK_MESSAGE(testNameIndicatesUnsupported != testResult[i].supported, + "The test name does not match the supportedness it is reporting"); + if (testResult[i].supported) + { + BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); + } + } +} + template<typename FactoryType, typename TFuncPtr, typename... Args> void RunTestFunction(const char* testName, TFuncPtr testFunction, Args... args) { + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + armnn::ProfilerManager::GetInstance().RegisterProfiler(profiler.get()); + FactoryType workloadFactory; auto testResult = (*testFunction)(workloadFactory, args...); CompareTestResultIfSupported(testName, testResult); diff --git a/src/armnn/test/UtilsTests.cpp b/src/armnn/test/UtilsTests.cpp index 11fa51626c..2268aa31e2 100644 --- a/src/armnn/test/UtilsTests.cpp +++ b/src/armnn/test/UtilsTests.cpp @@ -4,10 +4,14 @@ // #include <boost/test/unit_test.hpp> + #include <armnn/Utils.hpp> #include <armnn/Types.hpp> #include <armnn/TypesUtils.hpp> #include <armnn/Descriptors.hpp> +#include <GraphTopologicalSort.hpp> +#include <Graph.hpp> +#include "TypeUtils.hpp" BOOST_AUTO_TEST_SUITE(Utils) @@ -55,4 +59,110 @@ BOOST_AUTO_TEST_CASE(PermuteDescriptorWithDuplicatedMappings) BOOST_CHECK_THROW(armnn::PermuteDescriptor({ 1u, 1u, 0u }), armnn::InvalidArgumentException); } +BOOST_AUTO_TEST_CASE(HalfType) +{ + 
using namespace half_float::literal; + armnn::Half a = 1.0_h; + + float b = 1.0f; + armnn::Half c(b); + + // Test half type + BOOST_CHECK_EQUAL(a, b); + BOOST_CHECK_EQUAL(sizeof(c), 2); + + // Test half type is floating point type + BOOST_CHECK(std::is_floating_point<armnn::Half>::value); + + // Test utility function returns correct type. + using ResolvedType = armnn::ResolveType<armnn::DataType::Float16>; + constexpr bool isHalfType = std::is_same<armnn::Half, ResolvedType>::value; + BOOST_CHECK(isHalfType); + + armnn::DataType dt = armnn::GetDataType<armnn::Half>(); + BOOST_CHECK(dt == armnn::DataType::Float16); + + //Test utility functions return correct size + BOOST_CHECK(GetDataTypeSize(armnn::DataType::Float16) == 2); + + //Test utility functions return correct name + BOOST_CHECK((GetDataTypeName(armnn::DataType::Float16) == std::string("Float16"))); +} + +BOOST_AUTO_TEST_CASE(GraphTopologicalSortSimpleTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {2}; + graph[1] = {3}; + graph[2] = {4}; + graph[3] = {4}; + graph[4] = {5}; + graph[5] = {}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> targetNodes = {0, 1}; + + std::vector<int> output; + bool sortCompleted = armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(sortCompleted); + + std::vector<int> correctResult = {5, 4, 2, 0, 3, 1}; + BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), correctResult.begin(), correctResult.end()); +} + +BOOST_AUTO_TEST_CASE(GraphTopologicalSortVariantTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {2}; + graph[1] = {2}; + graph[2] = {3, 4}; + graph[3] = {5}; + graph[4] = {5}; + graph[5] = {6}; + graph[6] = {}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> targetNodes = {0, 1}; + + std::vector<int> output; + bool sortCompleted = 
armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(sortCompleted); + + std::vector<int> correctResult = {6, 5, 3, 4, 2, 0, 1}; + BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), correctResult.begin(), correctResult.end()); +} + +BOOST_AUTO_TEST_CASE(CyclicalGraphTopologicalSortTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {1}; + graph[1] = {2}; + graph[2] = {0}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> targetNodes = {0}; + + std::vector<int> output; + bool sortCompleted = armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(!sortCompleted); +} + BOOST_AUTO_TEST_SUITE_END() |