From 652bde553f506caac4c563988dc9baf746f9584d Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 10 Jan 2018 15:33:28 +0000
Subject: COMPMID-674 - Create Google InceptionV3 example

Change-Id: I389e0d4104b7dde60b7cdd612a83f3328517e44c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115804
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 arm_compute/core/SubTensorInfo.h                   |  14 +-
 arm_compute/graph/SubTensor.h                      |  33 +-
 arm_compute/runtime/CL/CLSubTensor.h               |  11 +-
 arm_compute/runtime/SubTensor.h                    |  11 +-
 examples/graph_inception_v3.cpp                    | 767 +++++++++++++++++++++
 src/core/CL/cl_kernels/batchnormalization_layer.cl |   4 +-
 src/core/SubTensorInfo.cpp                         |  53 +-
 src/core/TensorInfo.cpp                            |   4 +-
 src/graph/SubTensor.cpp                            |  20 +-
 src/graph/nodes/BranchLayer.cpp                    |  65 +-
 src/graph/nodes/ConvolutionLayer.cpp               |  22 +-
 src/runtime/CL/CLSubTensor.cpp                     |   6 +-
 src/runtime/SubTensor.cpp                          |   6 +-
 utils/GraphUtils.cpp                               |  21 +-
 utils/GraphUtils.h                                 |  46 +-
 15 files changed, 944 insertions(+), 139 deletions(-)
 create mode 100644 examples/graph_inception_v3.cpp

diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 7c464c0b17..67574f1326 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,12 +46,13 @@ public:
     SubTensorInfo();
     /** Default constructor
      *
-     * @param[in] parent       Metadata of parent tensor.
-     * @param[in] tensor_shape Tensor shape. Shape must fit inside parent's shape.
-     *                         X and Y dimensions must match the parent's ones.
-     * @param[in] coords       Coordinates of starting element inside parent tensor.
+     * @param[in] parent        Metadata of parent tensor.
+     * @param[in] tensor_shape  Tensor shape. Shape must fit inside parent's shape.
+     *                          X and Y dimensions must match the parent's ones.
+     * @param[in] coords        Coordinates of starting element inside parent tensor.
+     * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds
      */
-    SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords);
+    SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent = false);
     /** Default destructor */
     ~SubTensorInfo() = default;
     /** Allow instances of this class to be copy constructed */
@@ -216,6 +217,7 @@ private:
     TensorShape  _tensor_shape;
     Coordinates  _coords;
     ValidRegion  _valid_region;
+    bool         _extend_parent;
 };
 }
 #endif /*__ARM_COMPUTE_SUBTENSORINFO_H__ */
diff --git a/arm_compute/graph/SubTensor.h b/arm_compute/graph/SubTensor.h
index 72aa789274..43b835d49c 100644
--- a/arm_compute/graph/SubTensor.h
+++ b/arm_compute/graph/SubTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
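Common to all four headers touched by this patch is the new optional extend_parent flag: instead of asserting that a sub-tensor fits inside its parent, the parent's shape may be grown to cover the sub-tensor. A rough sketch of the intent using SubTensorInfo directly (shapes and values are illustrative, not taken from the patch):

    // Parent starts at 35x35x32; the sub-tensor is placed at depth 32.
    TensorInfo    parent_info(TensorShape(35U, 35U, 32U), 1, DataType::F32);
    SubTensorInfo sub_info(&parent_info, TensorShape(), Coordinates(0, 0, 32), true /* extend_parent */);

    // With a plain sub-tensor this would trip the sub-tensor validity check;
    // with extend_parent the parent grows along Z to contain the sub-tensor.
    sub_info.set_tensor_shape(TensorShape(35U, 35U, 64U));
    // parent_info.tensor_shape() is now 35x35x96.

This is the mechanism BranchLayer relies on further down to build a depth-concatenation output incrementally.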
* * SPDX-License-Identifier: MIT * @@ -44,19 +44,21 @@ public: SubTensor(); /** Constructor * - * @param[in] parent Parent to create sub-tensor from - * @param[in] tensor_shape Sub-tensor shape - * @param[in] coords Starting coordinates of the sub-tensor in the parent tensor + * @param[in] parent Parent to create sub-tensor from + * @param[in] tensor_shape Sub-tensor shape + * @param[in] coords Starting coordinates of the sub-tensor in the parent tensor + * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds */ - SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords); + SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent = false); /** Constructor * - * @param[in] parent Parent to create sub-tensor from - * @param[in] tensor_shape Sub-tensor shape - * @param[in] coords Starting coordinates of the sub-tensor in the parent tensor - * @param[in] target Execution target + * @param[in] parent Parent to create sub-tensor from + * @param[in] tensor_shape Sub-tensor shape + * @param[in] coords Starting coordinates of the sub-tensor in the parent tensor + * @param[in] target Execution target + * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds */ - SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target); + SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target, bool extend_parent = false); /** Prevent instances of this class from being copied (As this class contains pointers) */ SubTensor(const SubTensor &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -82,11 +84,12 @@ private: void instantiate_subtensor(); private: - TargetHint _target; /**< Target that this tensor is pinned on */ - TensorShape _tensor_shape; /**< SubTensor shape */ - Coordinates _coords; /**< SubTensor Coordinates */ - arm_compute::ITensor *_parent; /**< Parent tensor */ - std::unique_ptr _subtensor; /**< SubTensor */ + TargetHint _target; /**< Target that this tensor is pinned on */ + TensorShape _tensor_shape; /**< SubTensor shape */ + Coordinates _coords; /**< SubTensor Coordinates */ + arm_compute::ITensor *_parent; /**< Parent tensor */ + std::unique_ptr _subtensor; /**< SubTensor */ + bool _extend_parent; /**< Parent extension flag */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h index 4bab164779..b6e9a29df4 100644 --- a/arm_compute/runtime/CL/CLSubTensor.h +++ b/arm_compute/runtime/CL/CLSubTensor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,11 +39,12 @@ class CLSubTensor : public ICLTensor public: /** Constructor * - * @param[in] parent Parent tensor - * @param[in] tensor_shape Shape of the subtensor - * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. + * @param[in] parent Parent tensor + * @param[in] tensor_shape Shape of the subtensor + * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. 
+ * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds */ - CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords); + CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false); /** Destructor: free the tensor's memory */ ~CLSubTensor() = default; /** Restrict instances of this class to be copy constructed */ diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h index bdb229de49..ba2f8682af 100644 --- a/arm_compute/runtime/SubTensor.h +++ b/arm_compute/runtime/SubTensor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,11 +39,12 @@ class SubTensor : public ITensor public: /** Constructor * - * @param[in] parent Parent tensor - * @param[in] tensor_shape Shape of the subtensor - * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. + * @param[in] parent Parent tensor + * @param[in] tensor_shape Shape of the subtensor + * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. + * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds */ - SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords); + SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false); /** Destructor: free the tensor's memory */ ~SubTensor() = default; /** Restrict instances of this class to be copy constructed */ diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp new file mode 100644 index 0000000000..9bc82977c7 --- /dev/null +++ b/examples/graph_inception_v3.cpp @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
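The example file that follows builds the whole InceptionV3 network by streaming nodes into a Graph. Reduced to a couple of layers, the pattern it then repeats for several hundred lines looks like this (a sketch reusing the example's own helpers; build_minimal is a hypothetical name, not part of the patch):

    void build_minimal(Graph &graph, const std::string &data_path,
                       const std::string &image, const std::string &label)
    {
        constexpr float mean   = 0.f;   // subtracted from each channel
        constexpr float stddev = 255.f; // each channel is divided by this
        graph << TargetHint::NEON
              << Tensor(TensorInfo(TensorShape(299U, 299U, 3U, 1U), 1, DataType::F32),
                        get_input_accessor(image, mean, mean, mean, stddev, stddev, stddev, false))
              << ConvolutionLayer(3U, 3U, 32U,
                                  get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy"),
                                  std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
                                  PadStrideInfo(2, 2, 0, 0))
              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
              << Tensor(get_output_accessor(label, 5));
        graph.run();
    }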
+ */ +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Nodes.h" +#include "arm_compute/graph/SubGraph.h" +#include "support/ToolchainSupport.h" +#include "utils/GraphUtils.h" +#include "utils/Utils.h" + +#include +#include + +using namespace arm_compute::utils; +using namespace arm_compute::graph; +using namespace arm_compute::graph_utils; + +/** Example demonstrating how to implement InceptionV3's network using the Compute Library's graph API + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Path to the weights folder, [optional] image, [optional] labels ) + */ +class InceptionV3Example : public Example +{ +public: + void do_setup(int argc, char **argv) override + { + std::string data_path; /* Path to the trainable data */ + std::string image; /* Image data */ + std::string label; /* Label data */ + + constexpr float mean = 0.f; /* Mean value to subtract from the channels */ + constexpr float std = 255.f; /* Standard deviation value to divide from the channels */ + + // Set target. 0 (NEON), 1 (OpenCL). By default it is NEON + TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); + ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::DIRECT; + + // Parse arguments + if(argc < 2) + { + // Print help + std::cout << "Usage: " << argv[0] << " [target] [path_to_data] [image] [labels]\n\n"; + std::cout << "No data folder provided: using random values\n\n"; + } + else if(argc == 2) + { + std::cout << "Usage: " << argv[0] << " " << argv[1] << " [path_to_data] [image] [labels]\n\n"; + std::cout << "No data folder provided: using random values\n\n"; + } + else if(argc == 3) + { + data_path = argv[2]; + std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " [image] [labels]\n\n"; + std::cout << "No image provided: using random values\n\n"; + } + else if(argc == 4) + { + data_path = argv[2]; + image = argv[3]; + std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " [labels]\n\n"; + std::cout << "No text file with labels provided: skipping output accessor\n\n"; + } + else + { + data_path = argv[2]; + image = argv[3]; + label = argv[4]; + } + + graph << target_hint << convolution_hint << Tensor(TensorInfo(TensorShape(299U, 299U, 3U, 1U), 1, DataType::F32), + get_input_accessor(image, + mean, mean, mean, + std, std, std, false /* Do not convert to BGR */)) + + << ConvolutionLayer(3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy"), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + << BatchNormalizationLayer(get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + + << ConvolutionLayer(3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy"), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer(get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), + 
get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + + << ConvolutionLayer(3U, 3U, 64U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy"), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + << BatchNormalizationLayer(get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + + << ConvolutionLayer(1U, 1U, 80U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy"), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer(get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + + << ConvolutionLayer(3U, 3U, 192U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy"), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer(get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + + << get_inception_node_A(data_path, "Mixed_5b", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + 32U) + << get_inception_node_A(data_path, "Mixed_5c", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + 64U, true) + << get_inception_node_A(data_path, "Mixed_5d", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + 64U) + + << get_inception_node_B(data_path, "Mixed_6a", 384U, std::make_tuple(64U, 96U, 96U)) + + << get_inception_node_C(data_path, "Mixed_6b", 192U, std::make_tuple(128U, 128U, 192U), + std::make_tuple(128U, 128U, 128U, 128U, 192U), 192U) + << get_inception_node_C(data_path, "Mixed_6c", 192U, std::make_tuple(160U, 160U, 192U), + std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) + << get_inception_node_C(data_path, "Mixed_6d", 192U, std::make_tuple(160U, 160U, 192U), + std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) + << get_inception_node_C(data_path, "Mixed_6e", 192U, std::make_tuple(192U, 192U, 192U), + std::make_tuple(192U, 192U, 192U, 192U, 192U), 192U) + + << get_inception_node_D(data_path, "Mixed_7a", 
std::make_tuple(192U, 320U), + std::make_tuple(192U, 192U, 192U, 192U)) + + << get_inception_node_E(data_path, "Mixed_7b", 320U, std::make_tuple(384U, 384U, 384U), + std::make_tuple(448U, 384U, 384U, 384U), 192U) + << get_inception_node_E(data_path, "Mixed_7c", 320U, std::make_tuple(384U, 384U, 384U), + std::make_tuple(448U, 384U, 384U, 384U), 192U, true) + + << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))) + << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + << ReshapeLayer(TensorShape(1001U)) << SoftmaxLayer() + << Tensor(get_output_accessor(label, 5)); + } + + void do_run() override + { + graph.run(); + } + +private: + Graph graph{}; + +private: + BranchLayer get_inception_node_A(const std::string &data_path, std::string &¶m_path, + unsigned int a_filt, + std::tuple b_filters, + std::tuple c_filters, + unsigned int d_filt, + bool is_name_different = false) + { + std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; + std::cout << total_path << std::endl; + + // This is due to a naming issue in the tf model + std::string conv_id0 = "_0a_"; + std::string conv_id1 = "2d_0b_"; + if(is_name_different) + { + conv_id0 = "_0b_"; + conv_id1 = "_1_0c_"; + } + + SubGraph i_a; + i_a << ConvolutionLayer( + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b; + i_b << ConvolutionLayer( + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 5U, 5U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 2, 2)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_beta.npy"), + 0.001f) + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c; + i_c << ConvolutionLayer( + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<2>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_d; + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + << ConvolutionLayer( + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); + } + + BranchLayer get_inception_node_B(const std::string &data_path, std::string &¶m_path, + unsigned int a_filt, + std::tuple b_filters) + { + std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; + SubGraph i_a; + i_a << ConvolutionLayer( + 3U, 3U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), + 
get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b; + i_b << ConvolutionLayer( + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c; + i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f)); + + return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c)); + } + + BranchLayer get_inception_node_C(const std::string &data_path, std::string &¶m_path, + unsigned int a_filt, + std::tuple b_filters, + std::tuple c_filters, + unsigned int d_filt) + { + std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; + SubGraph i_a; + i_a << ConvolutionLayer( + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + 
"Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b; + i_b << ConvolutionLayer( + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 7U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 3, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 1U, 7U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 3)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c; + i_c << ConvolutionLayer( + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 1U, 7U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 3)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 7U, 1U, std::get<2>(c_filters), + 
get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 3, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 1U, 7U, std::get<3>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 3)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 7U, 1U, std::get<4>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 3, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_d; + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + << ConvolutionLayer( + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); + } + + BranchLayer get_inception_node_D(const std::string &data_path, std::string &¶m_path, + std::tuple a_filters, + std::tuple b_filters) + { + std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; + SubGraph i_a; + i_a << ConvolutionLayer( + 1U, 1U, std::get<0>(a_filters), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + 
get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<1>(a_filters), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b; + i_b << ConvolutionLayer( + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 7U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 3, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 1U, 7U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 3)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<3>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c; + i_c << 
PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f)); + + return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c)); + } + + BranchLayer get_inception_node_E(const std::string &data_path, std::string &¶m_path, + unsigned int a_filt, + std::tuple b_filters, + std::tuple c_filters, + unsigned int d_filt, + bool is_name_different = false) + { + // This is due to a naming issue in the tf model + std::string conv_id = "_0b_"; + if(is_name_different) + { + conv_id = "_0c_"; + } + + std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; + SubGraph i_a; + i_a << ConvolutionLayer( + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b1; + i_b1 << ConvolutionLayer( + 3U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b2; + i_b2 << ConvolutionLayer( + 1U, 3U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_b; + i_b << ConvolutionLayer( + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_b1), std::move(i_b2)); + + SubGraph i_c1; + i_c1 << ConvolutionLayer( + 3U, 1U, std::get<2>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c2; + i_c2 << ConvolutionLayer( + 1U, 3U, std::get<3>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + SubGraph i_c; + i_c << ConvolutionLayer( + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << ConvolutionLayer( + 3U, 3U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 1)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_c1), std::move(i_c2)); + + SubGraph i_d; + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + << ConvolutionLayer( + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + 
get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + + return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); + } +}; + +/** Main program for Inception V3 + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels ) + */ +int main(int argc, char **argv) +{ + return arm_compute::utils::run_example(argc, argv); +} diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl index f7aa5eb518..fbffefb3c0 100644 --- a/src/core/CL/cl_kernels/batchnormalization_layer.cl +++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -123,7 +123,7 @@ __kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input), numerator = SUB_OP(data, numerator); x_bar = MUL_OP(numerator, denominator); - gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x)); + gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); VSTORE(VEC_SIZE) diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp index 7a4886ff60..0150a95cc6 100644 --- a/src/core/SubTensorInfo.cpp +++ b/src/core/SubTensorInfo.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,17 +30,49 @@ using namespace arm_compute; +namespace +{ +/** Extends parent shape depending on subtensor's coordinates and shape + * + * @param parent_shape Parent shape + * @param shape Subtensor shape + * @param coords Subtensor coordinates inside parent tensor + * + * @return Extended parent shape + */ +TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords) +{ + // Subtensor should not index in x, y dimensions. + ARM_COMPUTE_ERROR_ON((coords.x() != 0) || (coords.y() != 0)); + + // Cannot extend on x, y ? 
+ ARM_COMPUTE_ERROR_ON((parent_shape.total_size() != 0) && (parent_shape.x() != shape.x()) && (parent_shape.y() != shape.y())); + + // Extend shape + for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + { + int dimension_extend = coords[i] + static_cast(shape[i]); + if((dimension_extend > static_cast(parent_shape[i])) && (dimension_extend > 0)) + { + parent_shape.set(i, static_cast(dimension_extend)); + } + } + + return parent_shape; +} +} // namespace + SubTensorInfo::SubTensorInfo() - : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape } + : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false) { } -SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords) - : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape } +SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent) + : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent) { ARM_COMPUTE_ERROR_ON(parent == nullptr); // Check if subtensor is valid if parent is configured - if(parent->tensor_shape().total_size() != 0) + if(parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape); } @@ -63,11 +95,19 @@ std::unique_ptr SubTensorInfo::clone() const ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape) { ARM_COMPUTE_ERROR_ON(_parent == nullptr); + // Check if subtensor is valid if parent is configured - if(_parent->tensor_shape().total_size() != 0) + if(_parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape); } + else if(_extend_parent) // Extend parent shape, configure if specified + { + ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN)); + TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords); + _parent->set_tensor_shape(parent_extended_shape); + _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape }); + } _tensor_shape = shape; return *this; } @@ -76,6 +116,7 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding) { ARM_COMPUTE_ERROR_ON(_parent == nullptr); ARM_COMPUTE_ERROR_ON(!_parent->is_resizable()); + ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0); // Extend parent padding if required return _parent->extend_padding(padding); diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 2008217c85..24988e2217 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2018 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. 
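Traced once with concrete numbers, the shape-extension loop in extend_parent_shape above behaves as follows (values are illustrative):

    // Illustrative trace of extend_parent_shape(parent_shape, shape, coords):
    //   parent_shape = 35x35x64, shape = 35x35x32, coords = (0, 0, 64)
    //   dim 0: 0  + 35 = 35 <= 35 -> unchanged
    //   dim 1: 0  + 35 = 35 <= 35 -> unchanged
    //   dim 2: 64 + 32 = 96 >  64 -> parent depth grows to 96
    // X/Y must match the parent and coords.x()/coords.y() must be 0, so a
    // sub-tensor can only grow the parent along Z and higher dimensions. Each
    // branch placed at the current depth grows the parent just enough to hold
    // it, which is how BranchLayer (see the hunk below) now sizes a depth
    // concatenation without tracking a separate output TensorInfo.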
* * SPDX-License-Identifier: MIT * @@ -364,6 +364,8 @@ ITensorInfo &TensorInfo::set_tensor_shape(TensorShape shape) _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension]; } + std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); + _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; return *this; } diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp index 2edeb3b1d4..2e640dd93c 100644 --- a/src/graph/SubTensor.cpp +++ b/src/graph/SubTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,21 +37,21 @@ using namespace arm_compute::graph; namespace { template -std::unique_ptr initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords) +std::unique_ptr initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords, bool extend_parent) { auto ptensor = dynamic_cast(parent); - auto subtensor = arm_compute::support::cpp14::make_unique(ptensor, shape, coords); + auto subtensor = arm_compute::support::cpp14::make_unique(ptensor, shape, coords, extend_parent); return std::move(subtensor); } } // namespace SubTensor::SubTensor() - : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr) + : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr), _extend_parent(false) { } -SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords) - : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr) +SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent) + : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr), _extend_parent(extend_parent) { ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr); _parent = parent.tensor(); @@ -60,8 +60,8 @@ SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coord instantiate_subtensor(); } -SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target) - : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr) +SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target, bool extend_parent) + : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr), _extend_parent(extend_parent) { ARM_COMPUTE_ERROR_ON(parent == nullptr); instantiate_subtensor(); @@ -108,10 +108,10 @@ void SubTensor::instantiate_subtensor() switch(_target) { case TargetHint::OPENCL: - _subtensor = initialise_subtensor(_parent, _tensor_shape, _coords); + _subtensor = initialise_subtensor(_parent, _tensor_shape, _coords, _extend_parent); break; case TargetHint::NEON: - _subtensor = initialise_subtensor(_parent, _tensor_shape, _coords); + _subtensor = initialise_subtensor(_parent, _tensor_shape, _coords, _extend_parent); break; default: ARM_COMPUTE_ERROR("Invalid TargetHint"); diff --git a/src/graph/nodes/BranchLayer.cpp b/src/graph/nodes/BranchLayer.cpp index 6352bfc1e3..7a20a565b8 100644 --- a/src/graph/nodes/BranchLayer.cpp +++ b/src/graph/nodes/BranchLayer.cpp @@ -37,46 +37,6 @@ using namespace arm_compute::graph; -namespace -{ -void 
depth_concatenate_output_info(ITensorInfo *info, ITensorInfo *sub_tensor_info)
-{
-    ARM_COMPUTE_ERROR_ON(info == nullptr);
-    ARM_COMPUTE_ERROR_ON(sub_tensor_info == nullptr);
-
-    TensorShape        info_shape            = info->tensor_shape();
-    const TensorShape &sub_tensor_info_shape = sub_tensor_info->tensor_shape();
-
-    // Update parent info and valid region
-    if(info_shape.total_size() == 0)
-    {
-        arm_compute::auto_init_if_empty(*info,
-                                        sub_tensor_info->tensor_shape(),
-                                        sub_tensor_info->num_channels(),
-                                        sub_tensor_info->data_type(), sub_tensor_info->fixed_point_position(), sub_tensor_info->quantization_info());
-        info->set_valid_region(sub_tensor_info->valid_region());
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON(info->num_channels() != sub_tensor_info->num_channels());
-        ARM_COMPUTE_ERROR_ON(info->data_type() != sub_tensor_info->data_type());
-        ARM_COMPUTE_ERROR_ON(info->fixed_point_position() != sub_tensor_info->fixed_point_position());
-
-        // Concatenate depth
-        ARM_COMPUTE_ERROR_ON(info_shape.x() != sub_tensor_info_shape.x());
-        ARM_COMPUTE_ERROR_ON(info_shape.y() != sub_tensor_info_shape.y());
-        info_shape.set(2, info_shape.z() + sub_tensor_info_shape.z());
-        info->set_tensor_shape(info_shape);
-
-        // Update valid region
-        arm_compute::ValidRegion info_valid_region = info->valid_region();
-        info_valid_region.shape.set(2, info_shape.z());
-        arm_compute::ValidRegion updated_region = arm_compute::intersect_valid_regions(info_valid_region, sub_tensor_info->valid_region());
-        info->set_valid_region(updated_region);
-    }
-}
-} // namespace
-
 /** Branch function */
 class BranchFunction final : public arm_compute::IFunction
 {
@@ -117,9 +77,8 @@ std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphConte
     // Create branch function
     auto func = arm_compute::support::cpp14::make_unique<BranchFunction>();

-    // Track output SubTensorInfo and depth
-    TensorInfo out_info;
-    int        depth = 0;
+    // Track output depth
+    int depth = 0;

     // Construct all sub-graphs given the input/output
     for(auto &sg : _sub_graphs)
@@ -143,10 +102,13 @@ std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphConte
         // Create output sub-tensor
         if(!sg->has_output())
         {
-            ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
-            out = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(output),
-                                                                      output->tensor()->info()->tensor_shape(),
-                                                                      Coordinates(0, 0, depth));
+            ARM_COMPUTE_ERROR_ON((dynamic_cast<Tensor *>(output) == nullptr) && (dynamic_cast<SubTensor *>(output) == nullptr));
+
+            out = arm_compute::support::cpp14::make_unique<SubTensor>(output->tensor(),
+                                                                      TensorShape(),
+                                                                      Coordinates(0, 0, depth),
+                                                                      output->target(),
+                                                                      true);
+
             out_sub_tensor = dynamic_cast<SubTensor *>(out.get());
         }
@@ -161,17 +123,8 @@ std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphConte
         {
             ARM_COMPUTE_ERROR_ON(out_sub_tensor->tensor() == nullptr);
             depth += out_sub_tensor->tensor()->info()->tensor_shape()[2];
-            depth_concatenate_output_info(&out_info, out_sub_tensor->tensor()->info());
         }
     }

-    // Auto-init output
-    arm_compute::auto_init_if_empty(*output->tensor()->info(),
-                                    out_info.tensor_shape(),
-                                    out_info.num_channels(),
-                                    out_info.data_type(),
-                                    out_info.fixed_point_position(),
-                                    out_info.quantization_info());
-
     return std::move(func);
 }
\ No newline at end of file
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index 53d06ea75f..f292b893ed 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -106,13 +106,16 @@ std::unique_ptr<arm_compute::IFunction> instantiate(arm_comp
     const WeightsInfo    &weights_info,
     ConvolutionMethodHint conv_method)
 {
-    if(conv_method ==
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index 53d06ea75f..f292b893ed 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -106,13 +106,16 @@ std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(arm_comp
                                                     const WeightsInfo    &weights_info,
                                                     ConvolutionMethodHint conv_method)
 {
-    if(conv_method == ConvolutionMethodHint::GEMM)
+    if((conv_method == ConvolutionMethodHint::DIRECT)
+       && arm_compute::CLDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
     {
-        return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDirectConvolutionLayer");
+        return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
     }
     else
     {
-        return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
+        return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
     }
 }
 
@@ -122,13 +125,16 @@ std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(arm_comput
                                                     const WeightsInfo    &weights_info,
                                                     ConvolutionMethodHint conv_method)
 {
-    if(conv_method == ConvolutionMethodHint::GEMM)
+    if((conv_method == ConvolutionMethodHint::DIRECT)
+       && arm_compute::NEDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
     {
-        return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDirectConvolutionLayer");
+        return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
     }
     else
     {
-        return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
+        return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
     }
 }
 } // namespace
@@ -258,12 +264,10 @@ std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_convolutio
     std::unique_ptr<arm_compute::IFunction> func;
     if(_target_hint == TargetHint::OPENCL)
     {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
         func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
         func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
     return func;
@@ -325,12 +329,10 @@ std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_grouped_co
         // Instantiate convolution function
         if(_target_hint == TargetHint::OPENCL)
         {
-            ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
             func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
         }
         else
         {
-            ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
             func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
         }
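
The pattern above (try the specialised kernel's static validate(), fall back to the generic implementation) is the core of the change. A standalone sketch of the same idea for the CL backend, with a hypothetical function name and tensor arguments assumed non-null:

    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
    #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
    #include "support/ToolchainSupport.h"

    #include <memory>

    using namespace arm_compute;

    std::unique_ptr<IFunction> pick_cl_convolution(ICLTensor *input, ICLTensor *weights, ICLTensor *biases,
                                                   ICLTensor *output, const PadStrideInfo &conv_info,
                                                   bool prefer_direct)
    {
        // Ask the direct convolution whether it supports this configuration
        // before committing to it; validate() inspects only the tensor infos,
        // so nothing is allocated or configured if it says no.
        if(prefer_direct
           && CLDirectConvolutionLayer::validate(input->info(), weights->info(),
                                                 (biases != nullptr) ? biases->info() : nullptr,
                                                 output->info(), conv_info))
        {
            auto direct = support::cpp14::make_unique<CLDirectConvolutionLayer>();
            direct->configure(input, weights, biases, output, conv_info);
            return std::move(direct);
        }

        // Fallback: the GEMM-based convolution handles the general case.
        auto generic = support::cpp14::make_unique<CLConvolutionLayer>();
        generic->configure(input, weights, biases, output, conv_info);
        return std::move(generic);
    }
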
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,11 +29,11 @@
 
 using namespace arm_compute;
 
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
     : _parent(nullptr), _info()
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
-    _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+    _info = SubTensorInfo(parent->info(), tensor_shape, coords, extend_parent);
     _parent = parent;
 }
 
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index 32924be3dc..c5b8f33c9a 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,11 +27,11 @@
 
 using namespace arm_compute;
 
-SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
     : _parent(nullptr), _info()
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
-    _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+    _info = SubTensorInfo(parent->info(), tensor_shape, coords, extend_parent);
     _parent = parent;
 }
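
A minimal runtime-level sketch of the flag's effect; the shapes and function name are hypothetical, and the stated behaviour (growing the parent's Z dimension) is what the extend_parent documentation above promises:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void extend_parent_demo()
    {
        Tensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));

        // Spans planes [3, 5) of an 8x8x3 parent, i.e. past its end. With
        // extend_parent == true the parent's shape is widened to 8x8x5;
        // with false the same call would trip the out-of-bounds assert.
        SubTensor tail(&parent, TensorShape(8U, 8U, 2U), Coordinates(0, 0, 3), true);

        parent.allocator()->allocate();
    }
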
diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp
index 6b3dffc1a4..2ff40b7fbb 100644
--- a/utils/GraphUtils.cpp
+++ b/utils/GraphUtils.cpp
@@ -30,13 +30,7 @@
 #include "arm_compute/runtime/CL/CLTensor.h"
 #endif /* ARM_COMPUTE_CL */
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-
-#include
 #include
-#include
-#include
 
 using namespace arm_compute::graph_utils;
 
@@ -80,8 +74,10 @@ bool DummyAccessor::access_tensor(ITensor &tensor)
     return ret;
 }
 
-PPMAccessor::PPMAccessor(const std::string &ppm_path, bool bgr, float mean_r, float mean_g, float mean_b)
-    : _ppm_path(ppm_path), _bgr(bgr), _mean_r(mean_r), _mean_g(mean_g), _mean_b(mean_b)
+PPMAccessor::PPMAccessor(std::string ppm_path, bool bgr,
+                         float mean_r, float mean_g, float mean_b,
+                         float std_r, float std_g, float std_b)
+    : _ppm_path(std::move(ppm_path)), _bgr(bgr), _mean_r(mean_r), _mean_g(mean_g), _mean_b(mean_b), _std_r(std_r), _std_g(std_g), _std_b(std_b)
 {
 }
 
@@ -94,6 +90,12 @@ bool PPMAccessor::access_tensor(ITensor &tensor)
         _mean_g,
         _bgr ? _mean_r : _mean_b
     };
+    const float std[3] =
+    {
+        _bgr ? _std_b : _std_r,
+        _std_g,
+        _bgr ? _std_r : _std_b
+    };
 
     // Open PPM file
     ppm.open(_ppm_path);
@@ -111,7 +113,7 @@ bool PPMAccessor::access_tensor(ITensor &tensor)
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const float value                                     = *reinterpret_cast<float *>(tensor.ptr_to_element(id)) - mean[id.z()];
-        *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = value;
+        *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = value / std[id.z()];
     });
 
     return true;
@@ -330,6 +332,7 @@ bool NumPyBinLoader::access_tensor(ITensor &tensor)
 
     // Validate tensor shape
     ARM_COMPUTE_ERROR_ON_MSG(shape.size() != tensor_shape.num_dimensions(), "Tensor ranks mismatch");
+
     if(fortran_order)
     {
         for(size_t i = 0; i < shape.size(); ++i)
         {
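
The accessor change boils down to per-channel standardisation: out = (in - mean[c]) / std[c], where c is the channel index after any BGR swap. A tiny self-contained illustration (function name and values hypothetical):

    #include <cassert>

    // Per-channel standardisation of the kind PPMAccessor now applies.
    float standardise(float in, int c, const float mean[3], const float stddev[3])
    {
        return (in - mean[c]) / stddev[c];
    }

    int main()
    {
        const float mean[3]   = { 127.5f, 127.5f, 127.5f };
        const float stddev[3] = { 127.5f, 127.5f, 127.5f };

        // A mid-grey pixel maps to 0, pure white to +1:
        assert(standardise(127.5f, 0, mean, stddev) == 0.f);
        assert(standardise(255.f, 1, mean, stddev) == 1.f);
        return 0;
    }
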
diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h
index e97bbf1c49..da52c26520 100644
--- a/utils/GraphUtils.h
+++ b/utils/GraphUtils.h
@@ -90,8 +90,13 @@ public:
      * @param[in] mean_r   (Optional) Red mean value to be subtracted from red channel
      * @param[in] mean_g   (Optional) Green mean value to be subtracted from green channel
      * @param[in] mean_b   (Optional) Blue mean value to be subtracted from blue channel
+     * @param[in] std_r    (Optional) Red standard deviation value to divide the red channel by
+     * @param[in] std_g    (Optional) Green standard deviation value to divide the green channel by
+     * @param[in] std_b    (Optional) Blue standard deviation value to divide the blue channel by
      */
-    PPMAccessor(const std::string &ppm_path, bool bgr = true, float mean_r = 0.0f, float mean_g = 0.0f, float mean_b = 0.0f);
+    PPMAccessor(std::string ppm_path, bool bgr = true,
+                float mean_r = 0.0f, float mean_g = 0.0f, float mean_b = 0.0f,
+                float std_r = 1.f, float std_g = 1.f, float std_b = 1.f);
     /** Allow instances of this class to be move constructed */
     PPMAccessor(PPMAccessor &&) = default;
 
@@ -99,11 +104,14 @@ public:
     bool access_tensor(ITensor &tensor) override;
 
 private:
-    const std::string &_ppm_path;
-    const bool         _bgr;
-    const float        _mean_r;
-    const float        _mean_g;
-    const float        _mean_b;
+    const std::string _ppm_path;
+    const bool        _bgr;
+    const float       _mean_r;
+    const float       _mean_g;
+    const float       _mean_b;
+    const float       _std_r;
+    const float       _std_g;
+    const float       _std_b;
 };
 
 /** Result accessor class */
@@ -180,6 +188,19 @@ private:
     const std::string _filename;
 };
 
+/** Generates appropriate random accessor
+ *
+ * @param[in] lower Lower random values bound
+ * @param[in] upper Upper random values bound
+ * @param[in] seed  Random generator seed
+ *
+ * @return A random accessor
+ */
+inline std::unique_ptr<graph::ITensorAccessor> get_random_accessor(PixelValue lower, PixelValue upper, const std::random_device::result_type seed = 0)
+{
+    return arm_compute::support::cpp14::make_unique<RandomAccessor>(lower, upper, seed);
+}
+
 /** Generates appropriate weights accessor according to the specified path
  *
  * @note If path is empty will generate a DummyAccessor else will generate a NumPyBinLoader
 *
@@ -209,10 +230,17 @@ inline std::unique_ptr<graph::ITensorAccessor> get_weights_accessor(const std::s
  * @param[in] mean_r Red mean value to be subtracted from red channel
  * @param[in] mean_g Green mean value to be subtracted from green channel
  * @param[in] mean_b Blue mean value to be subtracted from blue channel
+ * @param[in] std_r  (Optional) Red standard deviation value to divide the red channel by
+ * @param[in] std_g  (Optional) Green standard deviation value to divide the green channel by
+ * @param[in] std_b  (Optional) Blue standard deviation value to divide the blue channel by
+ * @param[in] bgr    (Optional) Fill the first plane with blue channel (default = true)
  *
  * @return An appropriate tensor accessor
  */
-inline std::unique_ptr<graph::ITensorAccessor> get_input_accessor(const std::string &ppm_path, float mean_r, float mean_g, float mean_b)
+inline std::unique_ptr<graph::ITensorAccessor> get_input_accessor(const std::string &ppm_path,
+                                                                  float mean_r = 0.f, float mean_g = 0.f, float mean_b = 0.f,
+                                                                  float std_r = 1.f, float std_g = 1.f, float std_b = 1.f,
+                                                                  bool bgr = true)
 {
     if(ppm_path.empty())
     {
@@ -220,7 +248,9 @@ inline std::unique_ptr<graph::ITensorAccessor> get_input_accessor(const std::str
     }
     else
     {
-        return arm_compute::support::cpp14::make_unique<PPMAccessor>(ppm_path, true, mean_r, mean_g, mean_b);
+        return arm_compute::support::cpp14::make_unique<PPMAccessor>(ppm_path, bgr,
+                                                                     mean_r, mean_g, mean_b,
+                                                                     std_r, std_g, std_b);
     }
 }
-- 
cgit v1.2.1
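
Taken together, the accessor helpers let an example feed and normalise its input in one statement. A sketch of how the extended get_input_accessor() and the new get_random_accessor() are meant to be called; the function name, mean/std values and shapes are hypothetical (the real InceptionV3 example supplies its own constants and NumPy weights):

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/Nodes.h"
    #include "utils/GraphUtils.h"

    using namespace arm_compute;
    using namespace arm_compute::graph_utils;

    void add_input(graph::Graph &net, const std::string &image_path)
    {
        // 299x299 RGB input, standardised to roughly [-1, 1] on the way in
        net << graph::Tensor(TensorInfo(TensorShape(299U, 299U, 3U, 1U), 1, DataType::F32),
                             get_input_accessor(image_path,
                                                127.5f, 127.5f, 127.5f, /* per-channel means */
                                                127.5f, 127.5f, 127.5f, /* per-channel stds  */
                                                false /* image is RGB, not BGR */));

        // Random data in [0, 1) is handy for smoke-testing with untrained weights
        auto rand = get_random_accessor(PixelValue(0.f), PixelValue(1.f));
    }
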