From e2220551b7a64b929650ba9a60529c31e70c13c5 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 20 Jul 2018 13:23:44 +0100 Subject: COMPMID-1367: Enable NHWC in graph examples Change-Id: Iabc54a3a1bdcd46a9a921cda39c7c85fef672b72 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141449 Reviewed-by: Giorgio Arena Reviewed-by: Anthony Barbier Tested-by: Jenkins --- .../core/CL/kernels/CLNormalizationLayerKernel.h | 2 +- arm_compute/core/Helpers.h | 60 +++++--- arm_compute/core/utils/misc/ShapeCalculator.h | 30 ++-- arm_compute/graph/GraphBuilder.h | 9 +- arm_compute/graph/INodeVisitor.h | 16 +- arm_compute/graph/TypePrinter.h | 44 +++--- arm_compute/graph/Types.h | 24 +-- arm_compute/graph/backends/FunctionHelpers.h | 116 ++++++++------- arm_compute/graph/backends/ValidateHelpers.h | 16 +- arm_compute/graph/frontend/Layers.h | 6 +- arm_compute/graph/frontend/Types.h | 6 +- arm_compute/graph/nodes/ConcatenateLayerNode.h | 84 +++++++++++ arm_compute/graph/nodes/ConvolutionLayerNode.h | 6 +- .../graph/nodes/DepthConcatenateLayerNode.h | 77 ---------- .../graph/nodes/DepthwiseConvolutionLayerNode.h | 2 +- arm_compute/graph/nodes/Nodes.h | 2 +- arm_compute/graph/nodes/NodesFwd.h | 2 +- arm_compute/graph/printers/DotGraphPrinter.h | 2 +- examples/graph_alexnet.cpp | 27 ++-- examples/graph_googlenet.cpp | 51 ++++--- examples/graph_inception_v3.cpp | 129 ++++++++-------- examples/graph_inception_v4.cpp | 163 +++++++++++---------- examples/graph_mobilenet.cpp | 2 +- examples/graph_resnet50.cpp | 35 +++-- examples/graph_resnext50.cpp | 32 ++-- examples/graph_squeezenet.cpp | 54 ++++--- examples/graph_squeezenet_v1_1.cpp | 58 ++++---- src/core/CL/cl_kernels/pooling_layer.cl | 4 +- src/core/CL/kernels/CLIm2ColKernel.cpp | 1 - src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 22 +-- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 4 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 50 ++++--- src/graph/GraphBuilder.cpp | 18 ++- src/graph/backends/CL/CLFunctionsFactory.cpp | 4 +- src/graph/backends/GLES/GCFunctionsFactory.cpp | 50 ++++++- src/graph/backends/GLES/GCNodeValidator.cpp | 6 +- src/graph/backends/NEON/NEFunctionFactory.cpp | 8 +- src/graph/mutators/DepthConcatSubTensorMutator.cpp | 14 +- src/graph/nodes/ConcatenateLayerNode.cpp | 141 ++++++++++++++++++ src/graph/nodes/DepthConcatenateLayerNode.cpp | 125 ---------------- src/graph/printers/DotGraphPrinter.cpp | 10 +- utils/CommonGraphOptions.cpp | 6 +- utils/CommonGraphOptions.h | 2 +- utils/TypePrinter.h | 29 ++++ 44 files changed, 868 insertions(+), 681 deletions(-) create mode 100644 arm_compute/graph/nodes/ConcatenateLayerNode.h delete mode 100644 arm_compute/graph/nodes/DepthConcatenateLayerNode.h create mode 100644 src/graph/nodes/ConcatenateLayerNode.cpp delete mode 100644 src/graph/nodes/DepthConcatenateLayerNode.cpp diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h index f2d37a781c..beeb8b838e 100644 --- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h @@ -72,7 +72,7 @@ private: const ICLTensor *_input; ICLTensor *_output; BorderSize _border_size; - bool _is_in_map; + bool _is_norm_across_width; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 374e36442b..ef59323073 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -111,28 +111,6 @@ struct 
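The _is_in_map to _is_norm_across_width rename above, and the get_normalization_dimension_index() helper added to Helpers.h in the hunk below, come from the same layout issue: once NHWC is allowed, the dimension that IN_MAP (spatial) normalization slides over is the width axis, whose index depends on the data layout, while CROSS_MAP normalization slides over the channel axis. The following is a minimal standalone sketch of that mapping in plain C++; Layout, NormType and norm_dimension are illustrative names rather than the library's types, and the index values follow arm_compute's x-fastest ordering (NCHW: W=0, H=1, C=2; NHWC: C=0, W=1, H=2).

#include <cstdio>

enum class Layout { NCHW, NHWC };
enum class NormType { IN_MAP_1D, CROSS_MAP };

// Returns the tensor dimension index the normalization window slides over.
static unsigned int norm_dimension(Layout layout, NormType type)
{
    const unsigned int width_idx   = (layout == Layout::NCHW) ? 0u : 1u;
    const unsigned int channel_idx = (layout == Layout::NCHW) ? 2u : 0u;
    return (type == NormType::CROSS_MAP) ? channel_idx : width_idx;
}

int main()
{
    // IN_MAP normalization runs over dimension 0 under NCHW but dimension 1 under NHWC,
    // so a plain "is in map" flag is no longer enough to pick the axis.
    std::printf("IN_MAP:    NCHW -> dim %u, NHWC -> dim %u\n",
                norm_dimension(Layout::NCHW, NormType::IN_MAP_1D),
                norm_dimension(Layout::NHWC, NormType::IN_MAP_1D));
    std::printf("CROSS_MAP: NCHW -> dim %u, NHWC -> dim %u\n",
                norm_dimension(Layout::NCHW, NormType::CROSS_MAP),
                norm_dimension(Layout::NHWC, NormType::CROSS_MAP));
    return 0;
}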
is_contained> : is_contained }; } -/** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform - * to know the number of tiles on the x and y direction - * - * @param[in] in_dims Spatial dimensions of the input tensor of convolution layer - * @param[in] kernel_size Kernel size - * @param[in] output_tile_size Size of a single output tile - * @param[in] conv_info Convolution info (i.e. pad, stride,...) - * - * @return the number of output tiles along the x and y directions of size "output_tile_size" - */ -inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info) -{ - int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast(output_tile_size.width)); - int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast(output_tile_size.height)); - - // Clamp in case we provide paddings but we have 1D convolution - num_tiles_x = std::min(num_tiles_x, static_cast(in_dims.width)); - num_tiles_y = std::min(num_tiles_y, static_cast(in_dims.height)); - - return Size2D(num_tiles_x, num_tiles_y); -} - /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. * @@ -694,6 +672,44 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord); * @return The int conversion of the requested data layout index. */ inline size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension); + +/** Calculate the normalization dimension index for a given normalization type + * + * @param[in] layout Data layout of the input and output tensor + * @param[in] info Normalization info + * + * @return Normalization dimension index + */ +inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info) +{ + const unsigned int width_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH); + const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL); + + return info.is_in_map() ? width_idx : channel_idx; +} + +/** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform + * to know the number of tiles on the x and y direction + * + * @param[in] in_dims Spatial dimensions of the input tensor of convolution layer + * @param[in] kernel_size Kernel size + * @param[in] output_tile_size Size of a single output tile + * @param[in] conv_info Convolution info (i.e. pad, stride,...) 
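compute_winograd_convolution_tiles() (removed above and re-added just below) appears to be a straight move within the header; its tile-count formula is unchanged. The following is a small self-contained check of that formula, with plain C++ ints standing in for Size2D and PadStrideInfo: for an F(2x2, 3x3) Winograd kernel each output tile covers a 2x2 patch of the output map, so the tile count per axis is the convolution output size divided by 2, rounded up.

#include <algorithm>
#include <cassert>
#include <cmath>

static int winograd_tiles(int in_dim, int kernel_dim, int tile_dim, int pad_before, int pad_after)
{
    // Mirrors the formula in the header: ceil((in - (k - 1) + pads) / tile), clamped for 1D convolutions.
    const int tiles = static_cast<int>(std::ceil((in_dim - (kernel_dim - 1) + pad_before + pad_after) / static_cast<float>(tile_dim)));
    return std::min(tiles, in_dim);
}

int main()
{
    // 14x14 input, 3x3 kernel, 2x2 output tiles, stride 1:
    assert(winograd_tiles(14, 3, 2, 0, 0) == 6); // valid padding: 12x12 output -> 6 tiles per axis
    assert(winograd_tiles(14, 3, 2, 1, 1) == 7); // same padding: 14x14 output -> 7 tiles per axis
    return 0;
}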
+ * + * @return the number of output tiles along the x and y directions of size "output_tile_size" + */ +inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info) +{ + int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast(output_tile_size.width)); + int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast(output_tile_size.height)); + + // Clamp in case we provide paddings but we have 1D convolution + num_tiles_x = std::min(num_tiles_x, static_cast(in_dims.width)); + num_tiles_y = std::min(num_tiles_y, static_cast(in_dims.height)); + + return Size2D(num_tiles_x, num_tiles_y); +} + } // namespace arm_compute #include "arm_compute/core/Helpers.inl" diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index e5516ba154..dbf26a423d 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -201,15 +201,8 @@ inline TensorShape compute_im2col_fc_shape(const ITensorInfo *input, const int n inline TensorShape compute_im2col_flatten_shape(const ITensorInfo *input) { // The output shape will be the flatten version of the input (i.e. [ width * height * channels, 1, 1, ... ] ). Used for FlattenLayer. - - ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 3); - TensorShape output_shape{ input->tensor_shape() }; - - const size_t flatten_shape = input->dimension(0) * input->dimension(1) * input->dimension(2); - output_shape.set(0, flatten_shape); - output_shape.remove_dimension(1); - output_shape.remove_dimension(1); + output_shape.collapse(3, 0); return output_shape; } @@ -403,20 +396,25 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo } template -inline TensorShape get_shape_from_info(T *info) +inline TensorShape extract_shape(T *data) +{ + return data->info()->tensor_shape(); +} + +inline TensorShape extract_shape(ITensorInfo *data) { - return info->info()->tensor_shape(); + return data->tensor_shape(); } -inline TensorShape get_shape_from_info(ITensorInfo *info) +inline TensorShape extract_shape(const TensorShape *data) { - return info->tensor_shape(); + return *data; } template inline TensorShape calculate_depth_concatenate_shape(const std::vector &inputs_vector) { - TensorShape out_shape = get_shape_from_info(inputs_vector[0]); + TensorShape out_shape = extract_shape(inputs_vector[0]); size_t max_x = 0; size_t max_y = 0; @@ -425,7 +423,7 @@ inline TensorShape calculate_depth_concatenate_shape(const std::vector &inp for(const auto &tensor : inputs_vector) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); - const TensorShape shape = get_shape_from_info(tensor); + const TensorShape shape = extract_shape(tensor); max_x = std::max(shape.x(), max_x); max_y = std::max(shape.y(), max_y); depth += shape.z(); @@ -441,13 +439,13 @@ inline TensorShape calculate_depth_concatenate_shape(const std::vector &inp template inline TensorShape calculate_width_concatenate_shape(const std::vector &inputs_vector) { - TensorShape out_shape = get_shape_from_info(inputs_vector[0]); + TensorShape out_shape = extract_shape(inputs_vector[0]); size_t width = 0; for(const auto &tensor : inputs_vector) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); - const TensorShape shape = get_shape_from_info(tensor); + const TensorShape shape = 
extract_shape(tensor); width += shape.x(); } diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h index 6359e05a63..191848c15f 100644 --- a/arm_compute/graph/GraphBuilder.h +++ b/arm_compute/graph/GraphBuilder.h @@ -128,8 +128,8 @@ public: * @return Node ID of the created node, EmptyNodeID in case of error */ static NodeID add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, - unsigned int num_groups = 1, ConvolutionMethod method = ConvolutionMethod::DEFAULT, FastMathHint fast_math_hint = FastMathHint::DISABLED, + Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, unsigned int num_groups = 1, + ConvolutionMethod method = ConvolutionMethod::Default, FastMathHint fast_math_hint = FastMathHint::Disabled, ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo weights_quant_info = QuantizationInfo(), const QuantizationInfo out_quant_info = QuantizationInfo()); @@ -155,10 +155,11 @@ public: * @param[in] g Graph to add the node to * @param[in] params Common node parameters * @param[in] inputs Inputs to the depth concatenate layer node as a NodeID-Index pair + * @param[in] axis Concatenation axis * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_depth_concatenate_node(Graph &g, NodeParams params, std::vector inputs); + static NodeID add_concatenate_node(Graph &g, NodeParams params, std::vector inputs, DataLayoutDimension axis); /** Adds a depth-wise convolution layer node to the graph * * @param[in] g Graph to add the node to @@ -175,7 +176,7 @@ public: */ static NodeID add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, PadStrideInfo conv_info, - DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::DEFAULT, + DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default, ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo quant_info = QuantizationInfo()); /** Adds an element-wise layer node to the graph * diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h index b5446c4a55..ad390ad760 100644 --- a/arm_compute/graph/INodeVisitor.h +++ b/arm_compute/graph/INodeVisitor.h @@ -51,6 +51,11 @@ public: * @param[in] n Node to visit. */ virtual void visit(BatchNormalizationLayerNode &n) = 0; + /** Visit ConcatenateLayerNode. + * + * @param[in] n Node to visit. + */ + virtual void visit(ConcatenateLayerNode &n) = 0; /** Visit ConstNode. * * @param[in] n Node to visit. @@ -61,11 +66,6 @@ public: * @param[in] n Node to visit. */ virtual void visit(ConvolutionLayerNode &n) = 0; - /** Visit DepthConcatenateLayerNode. - * - * @param[in] n Node to visit. - */ - virtual void visit(DepthConcatenateLayerNode &n) = 0; /** Visit DepthwiseConvolutionLayerNode. * * @param[in] n Node to visit. 
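calculate_depth_concatenate_shape() above (now accepting tensors, ITensorInfo or raw TensorShape through the extract_shape() overloads) keeps the largest spatial extents of the inputs and sums their depths. The following is a standalone sketch of that rule in plain C++, with shapes written x-fastest as {width, height, depth} to match TensorShape ordering; the numeric example uses the branch widths of a GoogLeNet inception_3a block.

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

using Shape3D = std::array<std::size_t, 3>; // {width, height, depth}

static Shape3D depth_concat_shape(const std::vector<Shape3D> &inputs)
{
    Shape3D out{ 0, 0, 0 };
    for(const auto &s : inputs)
    {
        out[0] = std::max(out[0], s[0]); // width  : max over inputs
        out[1] = std::max(out[1], s[1]); // height : max over inputs
        out[2] += s[2];                  // depth  : sum over inputs
    }
    return out;
}

int main()
{
    // Four inception branches, all 28x28 spatially, with 64/128/32/32 feature maps:
    const Shape3D out = depth_concat_shape({ { 28, 28, 64 }, { 28, 28, 128 }, { 28, 28, 32 }, { 28, 28, 32 } });
    assert((out == Shape3D{ 28, 28, 256 }));
    return 0;
}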
@@ -148,15 +148,15 @@ public: { default_visit(); } - virtual void visit(ConstNode &n) override + virtual void visit(ConcatenateLayerNode &n) override { default_visit(); } - virtual void visit(ConvolutionLayerNode &n) override + virtual void visit(ConstNode &n) override { default_visit(); } - virtual void visit(DepthConcatenateLayerNode &n) override + virtual void visit(ConvolutionLayerNode &n) override { default_visit(); } diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h index c3601f2373..7c0bd8cfdd 100644 --- a/arm_compute/graph/TypePrinter.h +++ b/arm_compute/graph/TypePrinter.h @@ -71,15 +71,15 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::ChannelShuffleLayer: os << "ChannelShuffleLayer"; break; + case NodeType::ConcatenateLayer: + os << "ConcatenateLayer"; + break; case NodeType::ConvolutionLayer: os << "ConvolutionLayer"; break; case NodeType::DeconvolutionLayer: os << "DeconvolutionLayer"; break; - case NodeType::DepthConcatenateLayer: - os << "DepthConcatenateLayer"; - break; case NodeType::DepthwiseConvolutionLayer: os << "DepthwiseConvolutionLayer"; break; @@ -134,14 +134,14 @@ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &el { switch(eltwise_op) { - case EltwiseOperation::ADD: - os << "ADD"; + case EltwiseOperation::Add: + os << "Add"; break; - case EltwiseOperation::MUL: - os << "MUL"; + case EltwiseOperation::Mul: + os << "Mul"; break; - case EltwiseOperation::SUB: - os << "SUB"; + case EltwiseOperation::Sub: + os << "Sub"; break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); @@ -155,17 +155,17 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &m { switch(method) { - case ConvolutionMethod::DEFAULT: - os << "DEFAULT"; + case ConvolutionMethod::Default: + os << "Default"; break; - case ConvolutionMethod::DIRECT: - os << "DIRECT"; + case ConvolutionMethod::Direct: + os << "Direct"; break; case ConvolutionMethod::GEMM: os << "GEMM"; break; - case ConvolutionMethod::WINOGRAD: - os << "WINOGRAD"; + case ConvolutionMethod::Winograd: + os << "Winograd"; break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); @@ -179,11 +179,11 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint) { switch(hint) { - case FastMathHint::ENABLED: - os << "ENABLED"; + case FastMathHint::Enabled: + os << "Enabled"; break; - case FastMathHint::DISABLED: - os << "DISABLED"; + case FastMathHint::Disabled: + os << "Disabled"; break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); @@ -197,14 +197,14 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolution { switch(method) { - case DepthwiseConvolutionMethod::DEFAULT: + case DepthwiseConvolutionMethod::Default: os << "DEFAULT"; break; case DepthwiseConvolutionMethod::GEMV: os << "GEMV"; break; - case DepthwiseConvolutionMethod::OPTIMIZED_3x3: - os << "OPTIMIZED_3x3"; + case DepthwiseConvolutionMethod::Optimized3x3: + os << "Optimized3x3"; break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h index c5b7fb1c51..f22f50ac82 100644 --- a/arm_compute/graph/Types.h +++ b/arm_compute/graph/Types.h @@ -96,33 +96,33 @@ enum class Target /** Supported Element-wise operations */ enum class EltwiseOperation { - ADD, /**< Arithmetic addition */ - SUB, /**< Arithmetic subtraction */ - MUL /**< Arithmetic multiplication */ + Add, /**< Arithmetic addition */ + Sub, /**< Arithmetic subtraction */ + Mul /**< Arithmetic 
multiplication */ }; /** Supported Convolution layer methods */ enum class ConvolutionMethod { - DEFAULT, /**< Default approach using internal heuristics */ + Default, /**< Default approach using internal heuristics */ GEMM, /**< GEMM based convolution */ - DIRECT, /**< Deep direct convolution */ - WINOGRAD /**< Winograd based convolution */ + Direct, /**< Deep direct convolution */ + Winograd /**< Winograd based convolution */ }; /** Supported Depthwise Convolution layer methods */ enum class DepthwiseConvolutionMethod { - DEFAULT, /**< Default approach using internal heuristics */ - GEMV, /**< Generic GEMV based depthwise convolution */ - OPTIMIZED_3x3, /**< Optimized 3x3 direct depthwise convolution */ + Default, /**< Default approach using internal heuristics */ + GEMV, /**< Generic GEMV based depthwise convolution */ + Optimized3x3, /**< Optimized 3x3 direct depthwise convolution */ }; /** Enable or disable fast math for Convolution layer */ enum class FastMathHint { - ENABLED, /**< Fast math enabled for Convolution layer */ - DISABLED, /**< Fast math disabled for Convolution layer */ + Enabled, /**< Fast math enabled for Convolution layer */ + Disabled, /**< Fast math disabled for Convolution layer */ }; /** Supported nodes */ @@ -131,9 +131,9 @@ enum class NodeType ActivationLayer, BatchNormalizationLayer, ChannelShuffleLayer, + ConcatenateLayer, ConvolutionLayer, DeconvolutionLayer, - DepthConcatenateLayer, DepthwiseConvolutionLayer, EltwiseLayer, FlattenLayer, diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h index 978d3bc1a8..172f00277e 100644 --- a/arm_compute/graph/backends/FunctionHelpers.h +++ b/arm_compute/graph/backends/FunctionHelpers.h @@ -192,6 +192,52 @@ std::unique_ptr create_channel_shuffle_layer(ChannelShuffleLayerNode return std::move(func); } +/** Create a backend layer concatenate function + * + * @tparam ConcatenateLayerFunction Backend concatenate function + * @tparam TargetInfo Target-specific information + * + * @param[in] node Node to create the backend function for + * + * @return Backend concatenate layer function + */ +template +std::unique_ptr create_concatenate_layer(ConcatenateLayerNode &node) +{ + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); + + // Return nullptr if depth concatenate is switched off + if(!node.is_enabled()) + { + return nullptr; + } + + // Extract IO and info + std::vector inputs; + for(unsigned int i = 0; i < node.num_inputs(); ++i) + { + inputs.push_back(get_backing_tensor(node.input(i))); + } + typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + const DataLayoutDimension concat_axis = node.concatenation_axis(); + + // Create and configure function + auto func = support::cpp14::make_unique(); + func->configure(inputs, output, concat_axis); + + // Log info + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type() + << " Target " << TargetInfo::TargetType + << " Data Type: " << output->info()->data_type() + << " Shape: " << output->info()->tensor_shape() + << " Num Inputs: " << inputs.size() + << " Axis: " << concat_axis + << std::endl); + + return std::move(func); +} + /** Create a backend convolution layer function * * @tparam ConvolutionLayerFunctions Backend convolution functions @@ -220,20 +266,20 @@ std::unique_ptr create_convolution_layer(ConvolutionLayerNode &node, const PadStrideInfo conv_info = 
node.convolution_info(); const ConvolutionMethod conv_algorithm = node.convolution_method(); - const bool fast_math = node.fast_math_hint() == FastMathHint::ENABLED; + const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled; // Create and configure function (we assume that functions have been validated before creation) std::shared_ptr mm = get_memory_manager(ctx, TargetInfo::TargetType); std::unique_ptr func; std::string func_name; - if(conv_algorithm == ConvolutionMethod::WINOGRAD) + if(conv_algorithm == ConvolutionMethod::Winograd) { std::tie(func, func_name) = create_named_memory_managed_function( std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math); } - else if(conv_algorithm == ConvolutionMethod::DIRECT) + else if(conv_algorithm == ConvolutionMethod::Direct) { std::tie(func, func_name) = create_named_function( std::string("DirectConvolutionLayer"), @@ -308,50 +354,6 @@ std::unique_ptr create_deconvolution_layer(DeconvolutionLayerNode &no return func; } -/** Create a backend layer depth concatenate function - * - * @tparam DepthConcatenateLayerFunction Backend depth concatenate function - * @tparam TargetInfo Target-specific information - * - * @param[in] node Node to create the backend function for - * - * @return Backend depth concatenate layer function - */ -template -std::unique_ptr create_depth_concatenate_layer(DepthConcatenateLayerNode &node) -{ - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl); - ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); - - // Return nullptr if depth concatenate is switched off - if(!node.is_enabled()) - { - return nullptr; - } - - // Extract IO and info - std::vector inputs; - for(unsigned int i = 0; i < node.num_inputs(); ++i) - { - inputs.push_back(get_backing_tensor(node.input(i))); - } - typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - - // Create and configure function - auto func = support::cpp14::make_unique(); - func->configure(inputs, output); - - // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type() - << " Target " << TargetInfo::TargetType - << " Data Type: " << output->info()->data_type() - << " Shape: " << output->info()->tensor_shape() - << " Num Inputs: " << inputs.size() - << std::endl); - - return std::move(func); -} - /** Create a backend layer depth-wise convolution function * * @tparam DepthwiseConvolutionLayerFunctions Backend depthwise convolution function @@ -383,7 +385,7 @@ std::unique_ptr create_depthwise_convolution_layer(DepthwiseConvoluti // Create and configure function (we assume that functions have been validated before creation) std::unique_ptr func; std::string func_name; - if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3) + if(dwc_algorithm == DepthwiseConvolutionMethod::Optimized3x3) { std::tie(func, func_name) = create_named_function( std::string("DepthwiseConvolutionLayer3x3"), @@ -435,19 +437,19 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) std::unique_ptr func = nullptr; std::string func_name; - if(eltwise_op == EltwiseOperation::ADD) + if(eltwise_op == EltwiseOperation::Add) { std::tie(func, func_name) = create_named_function( std::string("ArithmeticAddition"), input1, input2, output, convert_policy); } - else if(eltwise_op == EltwiseOperation::SUB) + else if(eltwise_op == EltwiseOperation::Sub) { std::tie(func, func_name) = create_named_function( 
std::string("ArithmeticSubtraction"), input1, input2, output, convert_policy); } - else if(eltwise_op == EltwiseOperation::MUL) + else if(eltwise_op == EltwiseOperation::Mul) { std::tie(func, func_name) = create_named_function( std::string("PixelWiseMultiplication"), @@ -487,11 +489,12 @@ std::unique_ptr create_flatten_layer(FlattenLayerNode &node) typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + // Create and configure function auto func = support::cpp14::make_unique(); func->configure(input, output); - ARM_COMPUTE_ERROR_ON(input == nullptr); - ARM_COMPUTE_ERROR_ON(output == nullptr); // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type() @@ -526,13 +529,14 @@ std::unique_ptr create_fully_connected_layer(FullyConnectedLayerNode typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); const FullyConnectedLayerInfo fc_info = node.info(); - // Create and configure function - auto func = support::cpp14::make_unique(get_memory_manager(ctx, TargetInfo::TargetType)); - func->configure(input, weights, biases, output, fc_info); ARM_COMPUTE_ERROR_ON(input == nullptr); ARM_COMPUTE_ERROR_ON(weights == nullptr); ARM_COMPUTE_ERROR_ON(output == nullptr); + // Create and configure function + auto func = support::cpp14::make_unique(get_memory_manager(ctx, TargetInfo::TargetType)); + func->configure(input, weights, biases, output, fc_info); + // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type() << " Target " << TargetInfo::TargetType diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h index 189fbdc9c7..ae52593b03 100644 --- a/arm_compute/graph/backends/ValidateHelpers.h +++ b/arm_compute/graph/backends/ValidateHelpers.h @@ -106,22 +106,22 @@ Status validate_convolution_layer(ConvolutionLayerNode &node) const PadStrideInfo conv_info = node.convolution_info(); const ConvolutionMethod conv_algorithm = node.convolution_method(); - //const bool fast_math = node.fast_math_hint() == FastMathHint::ENABLED; // FIXME (COMPMID-1138): uncomment once NEON and GLES support fast_math + const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled; // Validate function Status status{}; switch(conv_algorithm) { - case ConvolutionMethod::DIRECT: + case ConvolutionMethod::Direct: status = DirectConvolutionLayer::validate(input, weights, biases, output, conv_info); break; case ConvolutionMethod::GEMM: status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info); break; - case ConvolutionMethod::WINOGRAD: - status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info /*, fast_math*/); + case ConvolutionMethod::Winograd: + status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math); break; - case ConvolutionMethod::DEFAULT: + case ConvolutionMethod::Default: status = ConvolutionLayer::validate(input, weights, biases, output, conv_info); break; default: @@ -136,7 +136,7 @@ Status validate_convolution_layer(ConvolutionLayerNode &node) { ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " << node.id() << " and Name: " << node.name() << std::endl); - node.set_convolution_method(ConvolutionMethod::DEFAULT); + node.set_convolution_method(ConvolutionMethod::Default); } } @@ -166,11 +166,11 @@ Status 
validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node) // TODO (geopin01) : Switch when validation is implemented // Validate function - if((dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3) && (weights->tensor_shape()[get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH)] != 3)) + if((dwc_algorithm == DepthwiseConvolutionMethod::Optimized3x3) && (weights->tensor_shape()[get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH)] != 3)) { ARM_COMPUTE_LOG_GRAPH_INFO("Switched DepthwiseConvolutionLayer method of node with ID : " << node.id() << " and Name: " << node.name() << std::endl); - node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::DEFAULT); + node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); } return Status{}; diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h index 197d2ea409..02ef56952d 100644 --- a/arm_compute/graph/frontend/Layers.h +++ b/arm_compute/graph/frontend/Layers.h @@ -595,7 +595,7 @@ public: } else if(_branch_merge_method == BranchMergeMethod::DEPTH_CONCATENATE) { - // Collect tail nodes and perform DepthConcatenate + // Collect tail nodes and concatenate std::vector nodes; for(auto &ss : _sub_streams) { @@ -608,14 +608,14 @@ public: } } } - nid = GraphBuilder::add_depth_concatenate_node(s.graph(), common_params, nodes); + nid = GraphBuilder::add_concatenate_node(s.graph(), common_params, nodes, DataLayoutDimension::CHANNEL); } else { ARM_COMPUTE_ERROR_ON(_sub_streams.size() != 2); NodeIdxPair input0 = { _sub_streams[0]->tail_node(), 0 }; NodeIdxPair input1 = { _sub_streams[1]->tail_node(), 0 }; - nid = GraphBuilder::add_elementwise_node(s.graph(), common_params, input0, input1, EltwiseOperation::ADD); + nid = GraphBuilder::add_elementwise_node(s.graph(), common_params, input0, input1, EltwiseOperation::Add); } return nid; } diff --git a/arm_compute/graph/frontend/Types.h b/arm_compute/graph/frontend/Types.h index cd579e2119..f9d4952765 100644 --- a/arm_compute/graph/frontend/Types.h +++ b/arm_compute/graph/frontend/Types.h @@ -64,9 +64,9 @@ enum class BranchMergeMethod struct StreamHints { Target target_hint = { Target::UNSPECIFIED }; /**< Target execution hint */ - ConvolutionMethod convolution_method_hint = { ConvolutionMethod::DEFAULT }; /**< Convolution method hint */ - DepthwiseConvolutionMethod depthwise_convolution_method_hint = { DepthwiseConvolutionMethod::DEFAULT }; /**< Depthwise Convolution method hint */ - FastMathHint fast_math_hint = { FastMathHint::DISABLED }; /**< Fast math hint */ + ConvolutionMethod convolution_method_hint = { ConvolutionMethod::Default }; /**< Convolution method hint */ + DepthwiseConvolutionMethod depthwise_convolution_method_hint = { DepthwiseConvolutionMethod::Default }; /**< Depthwise Convolution method hint */ + FastMathHint fast_math_hint = { FastMathHint::Disabled }; /**< Fast math hint */ }; } // namespace frontend } // namespace graph diff --git a/arm_compute/graph/nodes/ConcatenateLayerNode.h b/arm_compute/graph/nodes/ConcatenateLayerNode.h new file mode 100644 index 0000000000..20c8523752 --- /dev/null +++ b/arm_compute/graph/nodes/ConcatenateLayerNode.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_CONCATENATE_LAYER_NODE_H__ +#define __ARM_COMPUTE_GRAPH_CONCATENATE_LAYER_NODE_H__ + +#include "arm_compute/graph/INode.h" + +namespace arm_compute +{ +namespace graph +{ +/** Concatenation Layer node */ +class ConcatenateLayerNode final : public INode +{ +public: + /** Constructor + * + * @param[in] total_nodes Number of nodes that will get concatenated + * @param[in] axis Concatenation axis + */ + ConcatenateLayerNode(unsigned int total_nodes, DataLayoutDimension axis); + /** Computes concatenations output descriptor + * + * @param[in] input_descriptors Input descriptors + * @param[in] axis Concatenation axis + * + * @return Expected output descriptor + */ + static TensorDescriptor compute_output_descriptor(const std::vector &input_descriptors, DataLayoutDimension axis); + /** Disables or not the depth concatenate node + * + * @warning This is used when concatenate is performed using sub-tensors, where this node is used as a placeholder. + * + * @param[in] is_enabled If true a backend function is created to perform the concatenation (involves copying), + * while if false, no function is created and we assume that sub-tensors are properly set to simulate + * a zero copy operation. 
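The following is a sketch of the shape arithmetic compute_output_descriptor() is expected to perform for a given concatenation axis (the .cpp body is not shown in this excerpt, so this is an assumption): the extent along the axis is summed over the inputs and the remaining dimensions are taken from the first input. The axis index itself is the node's DataLayoutDimension resolved against the input layout, e.g. CHANNEL is dimension 2 of an x-fastest NCHW shape {W,H,C,N} but dimension 0 of an NHWC shape {C,W,H,N}, which is why the node stores a logical dimension rather than a raw index.

#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

using Shape4D = std::array<std::size_t, 4>;

static Shape4D concat_shape(const std::vector<Shape4D> &inputs, std::size_t axis)
{
    Shape4D out = inputs.front();
    for(std::size_t i = 1; i < inputs.size(); ++i)
    {
        out[axis] += inputs[i][axis]; // sum along the concatenation axis only
    }
    return out;
}

int main()
{
    // Two 28x28 feature maps with 64 and 32 channels, expressed in each layout:
    const std::vector<Shape4D> nchw = { { 28, 28, 64, 1 }, { 28, 28, 32, 1 } }; // {W,H,C,N}
    const std::vector<Shape4D> nhwc = { { 64, 28, 28, 1 }, { 32, 28, 28, 1 } }; // {C,W,H,N}
    assert((concat_shape(nchw, 2) == Shape4D{ 28, 28, 96, 1 })); // channel axis under NCHW
    assert((concat_shape(nhwc, 0) == Shape4D{ 96, 28, 28, 1 })); // channel axis under NHWC
    return 0;
}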
+ */ + void set_enabled(bool is_enabled); + /** Enabled parameter accessor + * + * @return True if a backend function is to be created else false + */ + bool is_enabled() const; + /** Concatenation axis parameter accessor + * + * @return Concatenation axis + */ + DataLayoutDimension concatenation_axis() const; + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +private: + unsigned int _total_nodes; + DataLayoutDimension _axis; + bool _is_enabled; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_CONCATENATE_LAYER_NODE_H__ */ diff --git a/arm_compute/graph/nodes/ConvolutionLayerNode.h b/arm_compute/graph/nodes/ConvolutionLayerNode.h index aca60283d7..4299be6bb5 100644 --- a/arm_compute/graph/nodes/ConvolutionLayerNode.h +++ b/arm_compute/graph/nodes/ConvolutionLayerNode.h @@ -41,8 +41,10 @@ public: * @param[in] fast_math_hint (Optional) Fast math hint * @param[in] out_quant_info (Optional) Output quantization info */ - ConvolutionLayerNode(PadStrideInfo info, ConvolutionMethod method = ConvolutionMethod::DEFAULT, FastMathHint fast_math_hint = FastMathHint::DISABLED, - QuantizationInfo out_quant_info = QuantizationInfo()); + ConvolutionLayerNode(PadStrideInfo info, + ConvolutionMethod method = ConvolutionMethod::Default, + FastMathHint fast_math_hint = FastMathHint::Disabled, + QuantizationInfo out_quant_info = QuantizationInfo()); /** Sets the convolution layer method to use * * @param[in] method Method to use for convolution diff --git a/arm_compute/graph/nodes/DepthConcatenateLayerNode.h b/arm_compute/graph/nodes/DepthConcatenateLayerNode.h deleted file mode 100644 index ffdec709ef..0000000000 --- a/arm_compute/graph/nodes/DepthConcatenateLayerNode.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_GRAPH_DEPTH_CONCATENATE_LAYER_NODE_H__ -#define __ARM_COMPUTE_GRAPH_DEPTH_CONCATENATE_LAYER_NODE_H__ - -#include "arm_compute/graph/INode.h" - -namespace arm_compute -{ -namespace graph -{ -/** Depth Concatenation Layer node */ -class DepthConcatenateLayerNode final : public INode -{ -public: - /** Constructor - * - * @param[in] total_nodes Number of nodes that will get concatenated - */ - DepthConcatenateLayerNode(unsigned int total_nodes); - /** Computes depth concatenations output descriptor - * - * @param[in] input_descriptors Input descriptors - * - * @return Expected output descriptor - */ - static TensorDescriptor compute_output_descriptor(const std::vector &input_descriptors); - /** Disables or not the depth concatenate node - * - * @warning This is used when depth concatenate is performed with sub-tensors, - * where this node is used as a placeholder. - * - * @param[in] is_enabled If true a backend function is created to perform the depth concatenation (involves copying), - * while if false, no function is created and we assume that subtensors are properly set to simulate - * a no copy operation. - */ - void set_enabled(bool is_enabled); - /** Enabled parameter accessor - * - * @return True if a backend function is to be created else false - */ - bool is_enabled() const; - - // Inherited overridden methods: - NodeType type() const override; - bool forward_descriptors() override; - TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; - -private: - unsigned int _total_nodes; - bool _is_enabled; -}; -} // namespace graph -} // namespace arm_compute -#endif /* __ARM_COMPUTE_GRAPH_DEPTH_CONCATENATE_LAYER_NODE_H__ */ diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h index df6f456ac9..1a173c5421 100644 --- a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h +++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h @@ -39,7 +39,7 @@ public: * @param[in] info Convolution layer attributes * @param[in] method Depthwise convolution method to use */ - DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::DEFAULT); + DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default); /** Sets the depthwise convolution method to use * * @param[in] method Depthwise convolution method to use diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h index 97aa191916..f2e751e15f 100644 --- a/arm_compute/graph/nodes/Nodes.h +++ b/arm_compute/graph/nodes/Nodes.h @@ -27,10 +27,10 @@ #include "arm_compute/graph/nodes/ActivationLayerNode.h" #include "arm_compute/graph/nodes/BatchNormalizationLayerNode.h" #include "arm_compute/graph/nodes/ChannelShuffleLayerNode.h" +#include "arm_compute/graph/nodes/ConcatenateLayerNode.h" #include "arm_compute/graph/nodes/ConstNode.h" #include "arm_compute/graph/nodes/ConvolutionLayerNode.h" #include "arm_compute/graph/nodes/DeconvolutionLayerNode.h" -#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h" #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" #include "arm_compute/graph/nodes/DummyNode.h" #include "arm_compute/graph/nodes/EltwiseLayerNode.h" diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h index 05979d796c..a0a9146dc4 100644 --- a/arm_compute/graph/nodes/NodesFwd.h +++ 
b/arm_compute/graph/nodes/NodesFwd.h @@ -33,10 +33,10 @@ class INode; class ActivationLayerNode; class BatchNormalizationLayerNode; class ChannelShuffleLayerNode; +class ConcatenateLayerNode; class ConstNode; class ConvolutionLayerNode; class DeconvolutionLayerNode; -class DepthConcatenateLayerNode; class DepthwiseConvolutionLayerNode; class DummyNode; class EltwiseLayerNode; diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h index 1d355a52ee..d4cf6928e5 100644 --- a/arm_compute/graph/printers/DotGraphPrinter.h +++ b/arm_compute/graph/printers/DotGraphPrinter.h @@ -52,8 +52,8 @@ public: // Inherited methods overridden void visit(ActivationLayerNode &n) override; void visit(BatchNormalizationLayerNode &n) override; + void visit(ConcatenateLayerNode &n) override; void visit(ConvolutionLayerNode &n) override; - void visit(DepthConcatenateLayerNode &n) override; void visit(DepthwiseConvolutionLayerNode &n) override; void visit(EltwiseLayerNode &n) override; void visit(NormalizationLayerNode &n) override; diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp index 63e7b16128..944a435c3b 100644 --- a/examples/graph_alexnet.cpp +++ b/examples/graph_alexnet.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -72,14 +71,20 @@ public: const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(mean_rgb); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(227U, 227U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(227U, 227U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor))) + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) // Layer 1 << ConvolutionLayer( 11U, 11U, 96U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_b.npy"), PadStrideInfo(4, 4, 0, 0)) .set_name("conv1") @@ -89,7 +94,7 @@ public: // Layer 2 << ConvolutionLayer( 5U, 5U, 256U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_b.npy"), PadStrideInfo(1, 1, 2, 2), 2) .set_name("conv2") @@ -99,7 +104,7 @@ public: // Layer 3 << ConvolutionLayer( 3U, 3U, 384U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_b.npy"), PadStrideInfo(1, 1, 1, 1)) .set_name("conv3") @@ -107,7 +112,7 @@ public: // Layer 4 << ConvolutionLayer( 3U, 3U, 384U, - 
get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_b.npy"), PadStrideInfo(1, 1, 1, 1), 2) .set_name("conv4") @@ -115,7 +120,7 @@ public: // Layer 5 << ConvolutionLayer( 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_b.npy"), PadStrideInfo(1, 1, 1, 1), 2) .set_name("conv5") @@ -124,21 +129,21 @@ public: // Layer 6 << FullyConnectedLayer( 4096U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_b.npy")) .set_name("fc6") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu6") // Layer 7 << FullyConnectedLayer( 4096U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_b.npy")) .set_name("fc7") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu7") // Layer 8 << FullyConnectedLayer( 1000U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy"), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_b.npy")) .set_name("fc8") // Softmax diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp index 4497dbd470..d5bd0c0552 100644 --- a/examples/graph_googlenet.cpp +++ b/examples/graph_googlenet.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -72,13 +71,19 @@ public: const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(mean_rgb); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor))) + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) << ConvolutionLayer( 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy"), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_b.npy"), PadStrideInfo(2, 2, 3, 3)) << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) @@ -86,33 +91,33 @@ public: << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) << ConvolutionLayer( 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy"), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer( 3U, 3U, 192U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy"), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_b.npy"), PadStrideInfo(1, 1, 1, 1)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))); - graph << get_inception_node(data_path, "inception_3a", 64, std::make_tuple(96U, 128U), std::make_tuple(16U, 32U), 32U); - graph << get_inception_node(data_path, "inception_3b", 128, std::make_tuple(128U, 192U), std::make_tuple(32U, 96U), 64U); + graph << get_inception_node(data_path, "inception_3a", weights_layout, 64, std::make_tuple(96U, 128U), std::make_tuple(16U, 32U), 32U); + graph << get_inception_node(data_path, "inception_3b", weights_layout, 128, std::make_tuple(128U, 192U), std::make_tuple(32U, 96U), 64U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))); - graph << get_inception_node(data_path, "inception_4a", 192, std::make_tuple(96U, 208U), std::make_tuple(16U, 48U), 64U); - graph << get_inception_node(data_path, "inception_4b", 160, std::make_tuple(112U, 224U), std::make_tuple(24U, 64U), 64U); - graph << get_inception_node(data_path, "inception_4c", 128, std::make_tuple(128U, 256U), std::make_tuple(24U, 64U), 64U); - graph << get_inception_node(data_path, "inception_4d", 112, std::make_tuple(144U, 288U), std::make_tuple(32U, 64U), 64U); - graph << get_inception_node(data_path, "inception_4e", 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U); + graph << get_inception_node(data_path, "inception_4a", weights_layout, 192, std::make_tuple(96U, 208U), std::make_tuple(16U, 48U), 64U); + graph << get_inception_node(data_path, "inception_4b", weights_layout, 160, std::make_tuple(112U, 224U), std::make_tuple(24U, 64U), 64U); + graph << get_inception_node(data_path, "inception_4c", weights_layout, 128, std::make_tuple(128U, 256U), std::make_tuple(24U, 64U), 64U); + graph << get_inception_node(data_path, "inception_4d", weights_layout, 112, std::make_tuple(144U, 288U), std::make_tuple(32U, 64U), 64U); + graph << get_inception_node(data_path, "inception_4e", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))); - graph << get_inception_node(data_path, "inception_5a", 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U); - graph << get_inception_node(data_path, "inception_5b", 384, std::make_tuple(192U, 
384U), std::make_tuple(48U, 128U), 128U); + graph << get_inception_node(data_path, "inception_5a", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U); + graph << get_inception_node(data_path, "inception_5b", weights_layout, 384, std::make_tuple(192U, 384U), std::make_tuple(48U, 128U), 128U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 7, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))) << FullyConnectedLayer( 1000U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy"), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_b.npy")) << SoftmaxLayer() << OutputLayer(get_output_accessor(common_params, 5)); @@ -139,7 +144,7 @@ private: CommonGraphParams common_params; Stream graph; - BranchLayer get_inception_node(const std::string &data_path, std::string &¶m_path, + BranchLayer get_inception_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, unsigned int a_filt, std::tuple b_filters, std::tuple c_filters, @@ -149,7 +154,7 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "1x1_w.npy"), + get_weights_accessor(data_path, total_path + "1x1_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); @@ -157,13 +162,13 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy"), + get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "3x3_reduce_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer( 3U, 3U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "3x3_w.npy"), + get_weights_accessor(data_path, total_path + "3x3_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "3x3_b.npy"), PadStrideInfo(1, 1, 1, 1)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); @@ -171,13 +176,13 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer( 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy"), + get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "5x5_reduce_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer( 5U, 5U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "5x5_w.npy"), + get_weights_accessor(data_path, total_path + "5x5_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "5x5_b.npy"), PadStrideInfo(1, 1, 2, 2)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); @@ -186,7 +191,7 @@ private: i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "pool_proj_w.npy"), + get_weights_accessor(data_path, total_path + "pool_proj_w.npy", weights_layout), 
get_weights_accessor(data_path, total_path + "pool_proj_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp index 67f4e3cacf..c41b0a808e 100644 --- a/examples/graph_inception_v3.cpp +++ b/examples/graph_inception_v3.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -71,12 +70,18 @@ public: // Create a preprocessor object std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(299U, 299U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor), false)) + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) .set_name("Conv2d_1a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, @@ -89,7 +94,7 @@ public: .set_name("Conv2d_1a_3x3/BatchNorm/batchnorm") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu") << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name("Conv2d_2a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, @@ -103,7 +108,7 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu") << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) .set_name("Conv2d_2b_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, @@ -119,7 +124,7 @@ public: << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_3a_3x3/MaxPool") << ConvolutionLayer(1U, 1U, 80U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name("Conv2d_3b_1x1/convolution") << 
BatchNormalizationLayer(get_weights_accessor(data_path, @@ -133,7 +138,7 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu") << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name("Conv2d_4a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, @@ -148,45 +153,45 @@ public: << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_5a_3x3/MaxPool"); - graph << get_inception_node_A(data_path, "Mixed_5b", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + graph << get_inception_node_A(data_path, "Mixed_5b", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), 32U) .set_name("Mixed_5b/concat"); - graph << get_inception_node_A(data_path, "Mixed_5c", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + graph << get_inception_node_A(data_path, "Mixed_5c", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), 64U, true) .set_name("Mixed_5c/concat"); - graph << get_inception_node_A(data_path, "Mixed_5d", 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), + graph << get_inception_node_A(data_path, "Mixed_5d", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), 64U) .set_name("Mixed_5d/concat"); - graph << get_inception_node_B(data_path, "Mixed_6a", 384U, std::make_tuple(64U, 96U, 96U)).set_name("Mixed_6a/concat"); + graph << get_inception_node_B(data_path, "Mixed_6a", weights_layout, 384U, std::make_tuple(64U, 96U, 96U)).set_name("Mixed_6a/concat"); - graph << get_inception_node_C(data_path, "Mixed_6b", 192U, std::make_tuple(128U, 128U, 192U), + graph << get_inception_node_C(data_path, "Mixed_6b", weights_layout, 192U, std::make_tuple(128U, 128U, 192U), std::make_tuple(128U, 128U, 128U, 128U, 192U), 192U) .set_name("Mixed_6b/concat"); - graph << get_inception_node_C(data_path, "Mixed_6c", 192U, std::make_tuple(160U, 160U, 192U), + graph << get_inception_node_C(data_path, "Mixed_6c", weights_layout, 192U, std::make_tuple(160U, 160U, 192U), std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) .set_name("Mixed_6c/concat"); - graph << get_inception_node_C(data_path, "Mixed_6d", 192U, std::make_tuple(160U, 160U, 192U), + graph << get_inception_node_C(data_path, "Mixed_6d", weights_layout, 192U, std::make_tuple(160U, 160U, 192U), std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) .set_name("Mixed_6d/concat"); - graph << get_inception_node_C(data_path, "Mixed_6e", 192U, std::make_tuple(192U, 192U, 192U), + graph << get_inception_node_C(data_path, "Mixed_6e", weights_layout, 192U, std::make_tuple(192U, 192U, 192U), std::make_tuple(192U, 192U, 192U, 192U, 192U), 192U) .set_name("Mixed_6e/concat"); - graph << get_inception_node_D(data_path, "Mixed_7a", std::make_tuple(192U, 320U), + graph << get_inception_node_D(data_path, "Mixed_7a", weights_layout, std::make_tuple(192U, 320U), std::make_tuple(192U, 192U, 192U, 192U)) .set_name("Mixed_7a/concat"); - graph << get_inception_node_E(data_path, "Mixed_7b", 320U, std::make_tuple(384U, 384U, 384U), + graph << get_inception_node_E(data_path, "Mixed_7b", weights_layout, 320U, std::make_tuple(384U, 384U, 384U), std::make_tuple(448U, 384U, 
384U, 384U), 192U) .set_name("Mixed_7b/concat"); - graph << get_inception_node_E(data_path, "Mixed_7c", 320U, std::make_tuple(384U, 384U, 384U), + graph << get_inception_node_E(data_path, "Mixed_7c", weights_layout, 320U, std::make_tuple(384U, 384U, 384U), std::make_tuple(448U, 384U, 384U, 384U), 192U, true) .set_name("Mixed_7c/concat"); graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))).set_name("Logits/AvgPool_1a_8x8/AvgPool") << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy"), + "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"), PadStrideInfo(1, 1, 0, 0)) @@ -218,7 +223,7 @@ private: Stream graph; private: - BranchLayer get_inception_node_A(const std::string &data_path, std::string &¶m_path, + BranchLayer get_inception_node_A(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, unsigned int a_filt, std::tuple b_filters, std::tuple c_filters, @@ -239,7 +244,7 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") @@ -255,7 +260,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/convolution") @@ -269,7 +274,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/Relu") << ConvolutionLayer( 5U, 5U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 2, 2)) .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/convolution") @@ -285,7 +290,7 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer( 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") @@ -299,7 +304,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 3U, 3U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") @@ -313,7 +318,7 @@ private: << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu") << ConvolutionLayer( 3U, 3U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/convolution") @@ -330,7 +335,7 @@ private: i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)).set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") @@ -346,7 +351,7 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - BranchLayer get_inception_node_B(const std::string &data_path, std::string &¶m_path, + BranchLayer get_inception_node_B(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, unsigned int a_filt, std::tuple b_filters) { @@ -354,7 +359,7 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( 3U, 3U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/convolution") @@ -370,7 +375,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") @@ -384,7 +389,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 3U, 3U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/convolution") @@ -398,7 +403,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu") << ConvolutionLayer( 3U, 3U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/convolution") @@ -417,7 +422,7 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c)); } - BranchLayer get_inception_node_C(const std::string &data_path, std::string &¶m_path, + BranchLayer 
get_inception_node_C(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, unsigned int a_filt, std::tuple b_filters, std::tuple c_filters, @@ -427,7 +432,7 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") @@ -443,7 +448,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") @@ -457,7 +462,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 7U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") @@ -471,7 +476,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer( 1U, 7U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") @@ -487,7 +492,7 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer( 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") @@ -501,7 +506,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 1U, 7U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/convolution") @@ -515,7 +520,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu") << ConvolutionLayer( 7U, 1U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/convolution") @@ -529,7 +534,7 @@ private: << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu") << ConvolutionLayer( 1U, 7U, std::get<3>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/convolution") @@ -543,7 +548,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu") << ConvolutionLayer( 7U, 1U, std::get<4>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/convolution") @@ -560,7 +565,7 @@ private: i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)).set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") @@ -576,15 +581,15 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - BranchLayer get_inception_node_D(const std::string &data_path, std::string &¶m_path, - std::tuple a_filters, + BranchLayer get_inception_node_D(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, + std::tuple a_filters, std::tuple b_filters) { std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, std::get<0>(a_filters), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") @@ -598,7 +603,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 3U, 3U, std::get<1>(a_filters), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/convolution") @@ -614,7 +619,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") @@ -628,7 +633,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + 
"/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 7U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") @@ -642,7 +647,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer( 1U, 7U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") @@ -656,7 +661,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu") << ConvolutionLayer( 3U, 3U, std::get<3>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/convolution") @@ -675,7 +680,7 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c)); } - BranchLayer get_inception_node_E(const std::string &data_path, std::string &¶m_path, + BranchLayer get_inception_node_E(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, unsigned int a_filt, std::tuple b_filters, std::tuple c_filters, @@ -693,7 +698,7 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") @@ -709,7 +714,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") @@ -725,7 +730,7 @@ private: SubStream i_b1(i_b); i_b1 << ConvolutionLayer( 3U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/convolution") @@ -741,7 +746,7 @@ private: SubStream i_b2(i_b); i_b2 << ConvolutionLayer( 1U, 3U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/convolution") @@ -760,7 +765,7 @@ private: SubStream i_c(graph); 
i_c << ConvolutionLayer( 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") @@ -774,7 +779,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( 3U, 3U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") @@ -790,7 +795,7 @@ private: SubStream i_c1(i_c); i_c1 << ConvolutionLayer( 3U, 1U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/convolution") @@ -806,7 +811,7 @@ private: SubStream i_c2(i_c); i_c2 << ConvolutionLayer( 1U, 3U, std::get<3>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/convolution") @@ -826,7 +831,7 @@ private: i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)).set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp index 4e405923fc..b61acfcb3f 100644 --- a/examples/graph_inception_v4.cpp +++ b/examples/graph_inception_v4.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -71,13 +70,19 @@ public: // Create a preprocessor object std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(299U, 299U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor), false)) + << InputLayer(input_descriptor, 
get_input_accessor(common_params, std::move(preprocessor), false)) // Conv2d_1a_3x3 << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -87,7 +92,7 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) // Conv2d_2a_3x3 << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), @@ -97,7 +102,7 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) // Conv2d_2b_3x3 << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), @@ -106,35 +111,35 @@ public: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_mixed_3a(data_path); - graph << get_mixed_4a(data_path); - graph << get_mixed_5a(data_path); + graph << get_mixed_3a(data_path, weights_layout); + graph << get_mixed_4a(data_path, weights_layout); + graph << get_mixed_5a(data_path, weights_layout); // 4 inception A blocks - graph << get_inceptionA_block(data_path, "Mixed_5b"); - graph << get_inceptionA_block(data_path, "Mixed_5c"); - graph << get_inceptionA_block(data_path, "Mixed_5d"); - graph << get_inceptionA_block(data_path, "Mixed_5e"); + graph << get_inceptionA_block(data_path, weights_layout, "Mixed_5b"); + graph << get_inceptionA_block(data_path, weights_layout, "Mixed_5c"); + graph << get_inceptionA_block(data_path, weights_layout, "Mixed_5d"); + graph << get_inceptionA_block(data_path, weights_layout, "Mixed_5e"); // reduction A block - graph << get_reductionA_block(data_path); + graph << get_reductionA_block(data_path, weights_layout); // 7 inception B blocks - graph << get_inceptionB_block(data_path, "Mixed_6b"); - graph << get_inceptionB_block(data_path, "Mixed_6c"); - graph << get_inceptionB_block(data_path, "Mixed_6d"); - graph << get_inceptionB_block(data_path, "Mixed_6e"); - graph << get_inceptionB_block(data_path, "Mixed_6f"); - graph << get_inceptionB_block(data_path, "Mixed_6g"); - graph << get_inceptionB_block(data_path, "Mixed_6h"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6b"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6c"); + graph << 
get_inceptionB_block(data_path, weights_layout, "Mixed_6d"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6e"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6f"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6g"); + graph << get_inceptionB_block(data_path, weights_layout, "Mixed_6h"); // reduction B block - graph << get_reductionB_block(data_path); + graph << get_reductionB_block(data_path, weights_layout); // 3 inception C blocks - graph << get_inceptionC_block(data_path, "Mixed_7b"); - graph << get_inceptionC_block(data_path, "Mixed_7c"); - graph << get_inceptionC_block(data_path, "Mixed_7d"); + graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7b"); + graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7c"); + graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7d"); graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)) << FlattenLayer() << FullyConnectedLayer( 1001U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy")) << SoftmaxLayer() << OutputLayer(get_output_accessor(common_params, 5)); @@ -162,7 +167,7 @@ private: Stream graph; private: - BranchLayer get_mixed_3a(const std::string &data_path) + BranchLayer get_mixed_3a(const std::string &data_path, DataLayout weights_layout) { std::string total_path = "/cnn_data/inceptionv4_model/Mixed_3a_"; @@ -171,7 +176,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_variance.npy"), @@ -183,13 +188,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b)); } - BranchLayer get_mixed_4a(const std::string &data_path) + BranchLayer get_mixed_4a(const std::string &data_path, DataLayout weights_layout) { std::string total_path = "/cnn_data/inceptionv4_model/Mixed_4a_"; SubStream i_a(graph); i_a << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -198,7 +203,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), 
get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -209,7 +214,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -218,7 +223,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(7U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), @@ -227,7 +232,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(1U, 7U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), @@ -236,7 +241,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -248,13 +253,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b)); } - BranchLayer get_mixed_5a(const std::string &data_path) + BranchLayer get_mixed_5a(const std::string &data_path, DataLayout weights_layout) { std::string total_path = "/cnn_data/inceptionv4_model/Mixed_5a_"; SubStream i_a(graph); i_a << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -269,13 +274,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b)); } - BranchLayer 
get_inceptionA_block(const std::string &data_path, std::string &¶m_path) + BranchLayer get_inceptionA_block(const std::string &data_path, DataLayout weights_layout, std::string &¶m_path) { std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer(1U, 1U, 96U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -286,7 +291,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -295,7 +300,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), @@ -306,7 +311,7 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -315,7 +320,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), @@ -324,7 +329,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) << 
BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), @@ -336,7 +341,7 @@ private: SubStream i_d(graph); i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) << ConvolutionLayer(1U, 1U, 96U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), @@ -348,13 +353,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - BranchLayer get_reductionA_block(const std::string &data_path) + BranchLayer get_reductionA_block(const std::string &data_path, DataLayout weights_layout) { std::string total_path = "/cnn_data/inceptionv4_model/Mixed_6a_"; SubStream i_a(graph); i_a << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -365,7 +370,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -374,7 +379,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 224U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), @@ -383,7 +388,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path 
+ "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -398,13 +403,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c)); } - BranchLayer get_inceptionB_block(const std::string &data_path, std::string &¶m_path) + BranchLayer get_inceptionB_block(const std::string &data_path, DataLayout weights_layout, std::string &¶m_path) { std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer(1U, 1U, 384U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -415,7 +420,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -424,7 +429,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(7U, 1U, 224U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), @@ -433,7 +438,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(1U, 7U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), @@ -444,7 +449,7 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -453,7 +458,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(1U, 7U, 192U, - 
get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), @@ -462,7 +467,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(7U, 1U, 224U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), @@ -471,7 +476,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(1U, 7U, 224U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), @@ -480,7 +485,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(7U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), @@ -492,7 +497,7 @@ private: SubStream i_d(graph); i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) << ConvolutionLayer(1U, 1U, 128U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), @@ -504,13 +509,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - BranchLayer get_reductionB_block(const std::string &data_path) + BranchLayer get_reductionB_block(const std::string &data_path, DataLayout weights_layout) { std::string total_path = "/cnn_data/inceptionv4_model/Mixed_7a_"; SubStream i_a(graph); i_a << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + 
get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -519,7 +524,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -530,7 +535,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -539,7 +544,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(7U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), @@ -548,7 +553,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(1U, 7U, 320U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), @@ -557,7 +562,7 @@ private: 0.001f) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer(3U, 3U, 320U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), @@ -572,13 +577,13 @@ private: return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), 
std::move(i_b), std::move(i_c)); } - BranchLayer get_inceptionC_block(const std::string &data_path, std::string &¶m_path) + BranchLayer get_inceptionC_block(const std::string &data_path, DataLayout weights_layout, std::string &¶m_path) { std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), @@ -590,7 +595,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 1U, 1U, 384U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer( @@ -604,7 +609,7 @@ private: SubStream i_b1(i_b); i_b1 << ConvolutionLayer( 3U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) << BatchNormalizationLayer( @@ -618,7 +623,7 @@ private: SubStream i_b2(i_b); i_b2 << ConvolutionLayer( 1U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) << BatchNormalizationLayer( @@ -635,7 +640,7 @@ private: SubStream i_c(graph); i_c << ConvolutionLayer( 1U, 1U, 384U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer( @@ -647,7 +652,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer( 1U, 3U, 448U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) << BatchNormalizationLayer( @@ -659,7 +664,7 @@ private: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << ConvolutionLayer( 3U, 1U, 512U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) << BatchNormalizationLayer( @@ -673,7 +678,7 @@ private: SubStream i_c1(i_c); i_c1 << ConvolutionLayer( 3U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) << BatchNormalizationLayer( @@ -687,7 +692,7 @@ private: SubStream i_c2(i_c); i_c2 << ConvolutionLayer( 
1U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) << BatchNormalizationLayer( @@ -704,7 +709,7 @@ private: SubStream i_d(graph); i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp index a747b3cd11..d182844530 100644 --- a/examples/graph_mobilenet.cpp +++ b/examples/graph_mobilenet.cpp @@ -85,7 +85,7 @@ public: // Set graph hints graph << common_params.target - << DepthwiseConvolutionMethod::OPTIMIZED_3x3 // FIXME(COMPMID-1073): Add heuristics to automatically call the optimized 3x3 method + << DepthwiseConvolutionMethod::Optimized3x3 // FIXME(COMPMID-1073): Add heuristics to automatically call the optimized 3x3 method << common_params.fast_math_hint; // Create core graph diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp index 58f36f6ae4..0ad719a2ca 100644 --- a/examples/graph_resnet50.cpp +++ b/examples/graph_resnet50.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -72,13 +71,20 @@ public: const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(mean_rgb, false /* Do not convert to BGR */); + + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) << ConvolutionLayer( 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 3, 3)) .set_name("conv1/convolution") @@ -92,15 +98,15 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu") << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 1, 0, 1, 
DimensionRoundingType::FLOOR))).set_name("pool1/MaxPool"); - add_residual_block(data_path, "block1", 64, 3, 2); - add_residual_block(data_path, "block2", 128, 4, 2); - add_residual_block(data_path, "block3", 256, 6, 2); - add_residual_block(data_path, "block4", 512, 3, 1); + add_residual_block(data_path, "block1", weights_layout, 64, 3, 2); + add_residual_block(data_path, "block2", weights_layout, 128, 4, 2); + add_residual_block(data_path, "block3", weights_layout, 256, 6, 2); + add_residual_block(data_path, "block4", weights_layout, 512, 3, 1); graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)).set_name("pool5") << ConvolutionLayer( 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"), PadStrideInfo(1, 1, 0, 0)) .set_name("logits/convolution") @@ -129,7 +135,8 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, const std::string &name, unsigned int base_depth, unsigned int num_units, unsigned int stride) + void add_residual_block(const std::string &data_path, const std::string &name, DataLayout weights_layout, + unsigned int base_depth, unsigned int num_units, unsigned int stride) { for(unsigned int i = 0; i < num_units; ++i) { @@ -151,7 +158,7 @@ private: SubStream right(graph); right << ConvolutionLayer( 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(unit_name + "conv1/convolution") @@ -166,7 +173,7 @@ private: << ConvolutionLayer( 3U, 3U, base_depth, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(middle_stride, middle_stride, 1, 1)) .set_name(unit_name + "conv2/convolution") @@ -181,7 +188,7 @@ private: << ConvolutionLayer( 1U, 1U, base_depth * 4, - get_weights_accessor(data_path, unit_path + "conv3_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(unit_name + "conv3/convolution") @@ -198,7 +205,7 @@ private: SubStream left(graph); left << ConvolutionLayer( 1U, 1U, base_depth * 4, - get_weights_accessor(data_path, unit_path + "shortcut_weights.npy"), + get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) .set_name(unit_name + "shortcut/convolution") diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp index c0e9b9f22a..e7ef013f17 100644 --- a/examples/graph_resnext50.cpp +++ b/examples/graph_resnext50.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -68,26 +67,32 @@ public: // Get trainable parameters data path std::string data_path = common_params.data_path; + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, 1U), 
DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params)) + << InputLayer(input_descriptor, get_input_accessor(common_params)) << ScaleLayer(get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_mul.npy"), get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_add.npy")) .set_name("bn_data/Scale") << ConvolutionLayer( 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"), PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR)) .set_name("conv0/Convolution") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu") << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool0"); - add_residual_block(data_path, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, /*stride_conv_unit1*/ 1); - add_residual_block(data_path, 512, 2, 4, 2); - add_residual_block(data_path, 1024, 3, 6, 2); - add_residual_block(data_path, 2048, 4, 3, 2); + add_residual_block(data_path, weights_layout, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, /*stride_conv_unit1*/ 1); + add_residual_block(data_path, weights_layout, 512, 2, 4, 2); + add_residual_block(data_path, weights_layout, 1024, 3, 6, 2); + add_residual_block(data_path, weights_layout, 2048, 4, 3, 2); graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)).set_name("pool1") << FlattenLayer().set_name("predictions/Reshape") @@ -116,7 +121,8 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, unsigned int base_depth, unsigned int stage, unsigned int num_units, unsigned int stride_conv_unit1) + void add_residual_block(const std::string &data_path, DataLayout weights_layout, + unsigned int base_depth, unsigned int stage, unsigned int num_units, unsigned int stride_conv_unit1) { for(unsigned int i = 0; i < num_units; ++i) { @@ -137,7 +143,7 @@ private: SubStream right(graph); right << ConvolutionLayer( 1U, 1U, base_depth / 2, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), get_weights_accessor(data_path, unit_path + "conv1_biases.npy"), PadStrideInfo(1, 1, 0, 0)) .set_name(unit_name + "conv1/convolution") @@ -145,7 +151,7 @@ private: << ConvolutionLayer( 3U, 3U, base_depth / 2, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), std::unique_ptr(nullptr), pad_grouped_conv, 32) .set_name(unit_name + "conv2/convolution") @@ -156,7 +162,7 @@ private: << ConvolutionLayer( 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "conv3_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), get_weights_accessor(data_path, unit_path + "conv3_biases.npy"), PadStrideInfo(1, 1, 0, 0)) .set_name(unit_name + 
"conv3/convolution"); @@ -166,7 +172,7 @@ private: { left << ConvolutionLayer( 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "sc_weights.npy"), + get_weights_accessor(data_path, unit_path + "sc_weights.npy", weights_layout), std::unique_ptr(nullptr), PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0)) .set_name(unit_name + "sc/convolution") diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp index 9439ab4343..b539a9bc34 100644 --- a/examples/graph_squeezenet.cpp +++ b/examples/graph_squeezenet.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -72,78 +71,84 @@ public: const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(mean_rgb); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor))) + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) << ConvolutionLayer( 7U, 7U, 96U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"), PadStrideInfo(2, 2, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire2", 64U, 64U); + graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U); graph << ConvolutionLayer( 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire3", 64U, 64U); + graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U); graph << ConvolutionLayer( 1U, 1U, 32U, - get_weights_accessor(data_path, 
"/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire4", 128U, 128U); + graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire5", 128U, 128U); + graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U); graph << ConvolutionLayer( 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire6", 192U, 192U); + graph << get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U); graph << ConvolutionLayer( 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire7", 192U, 192U); + graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U); graph << ConvolutionLayer( 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire8", 256U, 256U); + graph << get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire9", 256U, 256U); + graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U); graph << ConvolutionLayer( 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) @@ -174,13 +179,14 @@ private: CommonGraphParams common_params; Stream graph; - BranchLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, unsigned int expand1_filt, unsigned int expand3_filt) + BranchLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, + unsigned int expand1_filt, unsigned int expand3_filt) { std::string total_path = "/cnn_data/squeezenet_v1.0_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, expand1_filt, - get_weights_accessor(data_path, total_path + "expand1x1_w.npy"), + get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); @@ -188,7 +194,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 3U, 3U, expand3_filt, - get_weights_accessor(data_path, total_path + "expand3x3_w.npy"), + get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), PadStrideInfo(1, 1, 1, 1)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp index ba7ee774a7..c0b5ff212d 100644 --- a/examples/graph_squeezenet_v1_1.cpp +++ b/examples/graph_squeezenet_v1_1.cpp @@ -60,7 +60,6 @@ public: // Checks ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!"); - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC, "Unsupported data layout!"); // Print parameter values std::cout << common_params << std::endl; @@ -72,80 +71,86 @@ public: const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; std::unique_ptr preprocessor = arm_compute::support::cpp14::make_unique(mean_rgb); + // Create input descriptor + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + + // Set weights trained layout + const DataLayout weights_layout = DataLayout::NCHW; + graph << common_params.target << common_params.fast_math_hint - << InputLayer(TensorDescriptor(TensorShape(227U, 227U, 3U, 1U), common_params.data_type), - get_input_accessor(common_params, std::move(preprocessor))) - << ConvolutionMethod::DIRECT + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) + << ConvolutionMethod::Direct << ConvolutionLayer( 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy"), + get_weights_accessor(data_path, 
"/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_b.npy"), PadStrideInfo(2, 2, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) - << ConvolutionMethod::DEFAULT + << ConvolutionMethod::Default << ConvolutionLayer( 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire2", 64U, 64U); + graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U); graph << ConvolutionLayer( 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire3", 64U, 64U); + graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire4", 128U, 128U); + graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U); graph << ConvolutionLayer( 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire5", 128U, 128U); + graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U); graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire6", 192U, 192U); + graph << 
get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U); graph << ConvolutionLayer( 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire7", 192U, 192U); + graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U); graph << ConvolutionLayer( 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire8", 256U, 256U); + graph << get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U); graph << ConvolutionLayer( 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); - graph << get_expand_fire_node(data_path, "fire9", 256U, 256U); + graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U); graph << ConvolutionLayer( 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy"), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy", weights_layout), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) @@ -176,13 +181,14 @@ private: CommonGraphParams common_params; Stream graph; - BranchLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, unsigned int expand1_filt, unsigned int expand3_filt) + BranchLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, + unsigned int expand1_filt, unsigned int expand3_filt) { std::string total_path = "/cnn_data/squeezenet_v1_1_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( 1U, 1U, expand1_filt, - get_weights_accessor(data_path, total_path + "expand1x1_w.npy"), + get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); @@ -190,7 +196,7 @@ private: SubStream i_b(graph); i_b << ConvolutionLayer( 3U, 3U, expand3_filt, - get_weights_accessor(data_path, total_path + "expand3x3_w.npy"), + get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout), get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), PadStrideInfo(1, 1, 1, 1)) << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index c38a78ce3e..080835348d 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -549,10 +549,10 @@ __kernel void pooling_layer_MxN_nhwc( for(int y = 0; y < POOL_SIZE_Y; ++y) { - int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT); + int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT); for(int x = 0; x < POOL_SIZE_X; ++x) { - int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH); + int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH); x1 = select(x1, PAD_X - idx_width - 1, y != y1); VEC_DATA_TYPE(DATA_TYPE, 8) diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index b1290b8edd..a09129bba6 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -288,7 +288,6 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_layout() != DataLayout::NCHW, "Special case Im2Col output layout is NCHW"); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation)); _input = input; diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index df01eab240..edc9e9d58c 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -42,6 +42,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D, + "Only Cross-map and 1D In-map normalization is supported for NHWC layout"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured @@ -59,14 +61,15 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, *input->clone()); - const unsigned int norm_size = norm_info.norm_size(); - bool is_in_map = norm_info.is_in_map(); + const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); + const unsigned int norm_size = norm_info.norm_size(); + bool is_norm_across_width = norm_idx == 0; - const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; + const unsigned int border_width = is_norm_across_width ? std::min(norm_size / 2, 3U) : 0; const BorderSize border_size = BorderSize(0, border_width); const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration; + const unsigned int num_elems_read_per_iteration = is_norm_across_width ? 
(num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration; Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); @@ -84,7 +87,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLNormalizationLayerKernel::CLNormalizationLayerKernel() - : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) + : _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false) { } @@ -106,8 +109,9 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou _input = input; _output = output; - _is_in_map = norm_info.is_in_map(); - const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0; + const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info); + _is_norm_across_width = norm_idx == 0; + const unsigned int border_width = _is_norm_across_width ? std::min(norm_info.norm_size() / 2, 3U) : 0; _border_size = BorderSize(0, border_width); const unsigned int num_elems_processed_per_iteration = 4; @@ -125,7 +129,7 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); // Create kernel - std::string kernel_name = _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; + std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map"; _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel window @@ -159,7 +163,7 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const int collapsed_dimension = _is_in_map ? Window::DimZ : 4; + const int collapsed_dimension = _is_norm_across_width ? 
Window::DimZ : 4; Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); Window slice = window_collapsed.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 246ab68130..d5ea092c78 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -154,7 +154,9 @@ std::tuple validate_and_configure_window(ITenso num_elems_processed_per_iteration = 8; win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowRectangle input_access(input, 0, -pool_pad_left, num_elems_processed_per_iteration, pool_size_x); + AccessWindowStatic input_access(input, + 0, -1, + ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); window_changed = update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index cb1996f33e..15e8298e2d 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -43,6 +43,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D, + "Only Cross-map and 1D In-map normalization is supported for NHWC layout"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); @@ -61,8 +63,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen { unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; - const unsigned int border_width = (norm_info.is_cross_map()) ? 0 : std::min(norm_info.norm_size() / 2, 3U); + const unsigned int border_width = (norm_idx == 2) ? 0 : std::min(norm_info.norm_size() / 2, 3U); BorderSize border_size = BorderSize(0, border_width); bool window_changed = false; @@ -107,7 +110,8 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info)); - const unsigned int border_width = (norm_info.is_cross_map()) ? 0 : std::min(norm_info.norm_size() / 2, 3U); + const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info); + const unsigned int border_width = (norm_idx == 2) ? 
0 : std::min(norm_info.norm_size() / 2, 3U); _input = input; _input_squared = input_squared; @@ -119,16 +123,21 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * { case DataType::F32: { - switch(norm_info.type()) + switch(norm_idx) { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerKernel::normalize_float; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerKernel::normalize_float; + case 0: + { + if(norm_info.type() == NormType::IN_MAP_2D) + { + _func = &NENormalizationLayerKernel::normalize_float; + } + else + { + _func = &NENormalizationLayerKernel::normalize_float; + } break; - case NormType::CROSS_MAP: + } + case 2: _func = &NENormalizationLayerKernel::normalize_float; break; default: @@ -138,16 +147,21 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * } case DataType::F16: { - switch(norm_info.type()) + switch(norm_idx) { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerKernel::normalize_float; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerKernel::normalize_float; + case 0: + { + if(norm_info.type() == NormType::IN_MAP_2D) + { + _func = &NENormalizationLayerKernel::normalize_float; + } + else + { + _func = &NENormalizationLayerKernel::normalize_float; + } break; - case NormType::CROSS_MAP: + } + case 2: _func = &NENormalizationLayerKernel::normalize_float; break; default: diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index d26039ec35..b3721719d9 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -88,10 +88,14 @@ NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPai bool has_bias = (bias != EmptyNodeID); // Split input - NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, 2); + const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); + const unsigned int input_idx = get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL); + NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); // Split weights - NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, 3); + const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]); + const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc, DataLayoutDimension::BATCHES); + NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx); // Split bias NodeID bias_split = EmptyNodeID; @@ -122,7 +126,7 @@ } // Depth concatenate output - return GraphBuilder::add_depth_concatenate_node(g, params, convolution_outputs); + return GraphBuilder::add_concatenate_node(g, params, convolution_outputs, DataLayoutDimension::CHANNEL); } } // namespace @@ -329,11 +333,11 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx return deconv_nid; } -NodeID GraphBuilder::add_depth_concatenate_node(Graph &g, NodeParams params, std::vector inputs) +NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, std::vector inputs, DataLayoutDimension axis) { ARM_COMPUTE_ERROR_ON(inputs.size() == 0); - NodeID nid = g.add_node(inputs.size()); + NodeID nid = g.add_node(inputs.size(), axis); unsigned int i = 0; for(const auto &input : inputs) @@ -508,9 +512,9 @@ NodeID 
GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdx NodeIdxPair add_const_nidxp = { add_const_nid, 0 }; // Create node and connect - NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::MUL); + NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul); NodeIdxPair mulnode_nidxp = { mul_node, 0 }; - NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::ADD); + NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add); return add_node; } diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index 4d6734846a..57871487ef 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -89,8 +89,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); case NodeType::DeconvolutionLayer: return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); - case NodeType::DepthConcatenateLayer: - return detail::create_depth_concatenate_layer(*polymorphic_downcast(node)); + case NodeType::ConcatenateLayer: + return detail::create_concatenate_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::EltwiseLayer: diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp index e6bd5a5f02..f72513c87c 100644 --- a/src/graph/backends/GLES/GCFunctionsFactory.cpp +++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp @@ -68,6 +68,42 @@ struct GCEltwiseFunctions namespace detail { +// Specialize functions +template <> +std::unique_ptr create_concatenate_layer(ConcatenateLayerNode &node) +{ + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); + + // Return nullptr if depth concatenate is switched off + if(!node.is_enabled()) + { + return nullptr; + } + + // Extract IO and info + std::vector inputs; + for(unsigned int i = 0; i < node.num_inputs(); ++i) + { + inputs.push_back(get_backing_tensor(node.input(i))); + } + typename GCTargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + + // Create and configure function + auto func = support::cpp14::make_unique(); + func->configure(inputs, output); + + // Log info + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type() + << " Target " << GCTargetInfo::TargetType + << " Data Type: " << output->info()->data_type() + << " Shape: " << output->info()->tensor_shape() + << " Num Inputs: " << inputs.size() + << std::endl); + + return std::move(func); +} + template <> std::unique_ptr create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx) { @@ -92,7 +128,7 @@ std::unique_ptr create_convolution_layer func; std::string func_name; - if(conv_algorithm == ConvolutionMethod::DIRECT) + if(conv_algorithm == ConvolutionMethod::Direct) { std::tie(func, func_name) = create_named_function( std::string("DirectConvolutionLayer"), @@ -139,7 +175,7 @@ std::unique_ptr create_depthwise_convolution_layer func; std::string func_name; - if(dwc_algorithm == 
DepthwiseConvolutionMethod::Optimized3x3) { std::tie(func, func_name) = create_named_function( std::string("DepthwiseConvolutionLayer3x3"), @@ -183,17 +219,17 @@ std::unique_ptr create_eltwise_layer func = nullptr; std::string func_name; - if(eltwise_op == EltwiseOperation::ADD) + if(eltwise_op == EltwiseOperation::Add) { std::tie(func, func_name) = create_named_function( std::string("GCArithmeticAddition"), input1, input2, output, convert_policy); } - else if(eltwise_op == EltwiseOperation::SUB) + else if(eltwise_op == EltwiseOperation::Sub) { ARM_COMPUTE_ERROR("Arithmetic subtraction is not supported in GLES backend"); } - else if(eltwise_op == EltwiseOperation::MUL) + else if(eltwise_op == EltwiseOperation::Mul) { std::tie(func, func_name) = create_named_function( std::string("PixelWiseMultiplication"), @@ -232,8 +268,8 @@ std::unique_ptr GCFunctionFactory::create(INode *node, GraphContext & return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); case NodeType::ConvolutionLayer: return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); - case NodeType::DepthConcatenateLayer: - return detail::create_depth_concatenate_layer(*polymorphic_downcast(node)); + case NodeType::ConcatenateLayer: + return detail::create_concatenate_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::EltwiseLayer: diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp index 4bef89329a..8118a7c476 100644 --- a/src/graph/backends/GLES/GCNodeValidator.cpp +++ b/src/graph/backends/GLES/GCNodeValidator.cpp @@ -58,7 +58,7 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node) // TODO (geopin01) : Switch when validation is implemented // Validate function ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 && weights->tensor_shape().y() != 3, "Unsupported depthwise convolution"); - node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::OPTIMIZED_3x3); + node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::Optimized3x3); return Status{}; } @@ -80,14 +80,14 @@ Status validate_convolution_layer(ConvolutionLayerNode &node) const ConvolutionMethod conv_algorithm = node.convolution_method(); // Validate function - if(conv_algorithm == ConvolutionMethod::DIRECT) + if(conv_algorithm == ConvolutionMethod::Direct) { bool is_square = weights->tensor_shape().x() == weights->tensor_shape().y(); bool is_direct = (weights->tensor_shape().x() == 1) || (weights->tensor_shape().x() == 3) || (weights->tensor_shape().x() == 5); bool is_correct_stride = (conv_info.stride().first) <= 2 && (conv_info.stride().second <= 2); if(!(is_square && is_direct && is_correct_stride)) { - node.set_convolution_method(ConvolutionMethod::DEFAULT); + node.set_convolution_method(ConvolutionMethod::Default); } } diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp index 3b7417da3f..6c912a02f1 100644 --- a/src/graph/backends/NEON/NEFunctionFactory.cpp +++ b/src/graph/backends/NEON/NEFunctionFactory.cpp @@ -102,7 +102,7 @@ std::unique_ptr create_convolution_layer mm = get_memory_manager(ctx, Target::NEON); std::unique_ptr func; std::string func_name; - if(conv_algorithm == ConvolutionMethod::DIRECT) + if(conv_algorithm == ConvolutionMethod::Direct) { std::tie(func, func_name) = create_named_memory_managed_function( 
std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info); @@ -112,7 +112,7 @@ std::unique_ptr create_convolution_layer( std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info); } - else if(conv_algorithm == ConvolutionMethod::WINOGRAD) + else if(conv_algorithm == ConvolutionMethod::Winograd) { std::tie(func, func_name) = create_named_memory_managed_function( std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info); @@ -183,8 +183,8 @@ std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext & return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); case NodeType::DeconvolutionLayer: return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); - case NodeType::DepthConcatenateLayer: - return detail::create_depth_concatenate_layer(*polymorphic_downcast(node)); + case NodeType::ConcatenateLayer: + return detail::create_concatenate_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::EltwiseLayer: diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp index c56f4c5106..241c07b367 100644 --- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp +++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp @@ -25,8 +25,9 @@ #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/backends/BackendRegistry.h" -#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h" +#include "arm_compute/graph/nodes/ConcatenateLayerNode.h" #include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/Iterable.h" @@ -45,11 +46,18 @@ void DepthConcatSubTensorMutator::mutate(Graph &g) // Should be in reverse order of execution for(auto &node : arm_compute::utils::iterable::reverse_iterate(g.nodes())) { - if(node && node->type() == NodeType::DepthConcatenateLayer && node->output(0) != nullptr) + if(node && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) { // Get output tensor auto output_tensor = node->output(0); + // Check concatenation axis (Sub-tensor optimization is support for concatenation axis >=2) + auto *concat_node = arm_compute::utils::cast::polymorphic_downcast(node.get()); + if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc(), concat_node->concatenation_axis()) < 2) + { + continue; + } + // Check that all tensor have the same target and valid inputs bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), [&](const EdgeID & eid) @@ -76,7 +84,7 @@ void DepthConcatSubTensorMutator::mutate(Graph &g) depth += input_shape.z(); } - auto *dc_node = arm_compute::utils::cast::polymorphic_downcast(node.get()); + auto *dc_node = arm_compute::utils::cast::polymorphic_downcast(node.get()); dc_node->set_enabled(false); } } diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp new file mode 100644 index 0000000000..ade3f6e1a9 --- /dev/null +++ b/src/graph/nodes/ConcatenateLayerNode.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/graph/nodes/ConcatenateLayerNode.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/Utils.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +namespace arm_compute +{ +namespace graph +{ +ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, DataLayoutDimension axis) + : _total_nodes(total_nodes), _axis(axis), _is_enabled(true) +{ + _input_edges.resize(_total_nodes, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +void ConcatenateLayerNode::set_enabled(bool is_enabled) +{ + _is_enabled = is_enabled; +} + +bool ConcatenateLayerNode::is_enabled() const +{ + return _is_enabled; +} + +DataLayoutDimension ConcatenateLayerNode::concatenation_axis() const +{ + return _axis; +} + +TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vector &input_descriptors, + DataLayoutDimension axis) +{ + ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0); + + TensorDescriptor output_descriptor = input_descriptors[0]; + const int axis_idx = get_dimension_idx(output_descriptor, axis); + + // Extract shapes + std::vector shapes; + for(auto &input_descriptor : input_descriptors) + { + shapes.emplace_back(&input_descriptor.shape); + } + + // Calculate output shape + if(axis_idx == 0) + { + output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(shapes); + } + else if(axis_idx == 2) + { + output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(shapes); + } + else + { + ARM_COMPUTE_ERROR("Unsupported concatenation axis!"); + } + + return output_descriptor; +} + +bool ConcatenateLayerNode::forward_descriptors() +{ + if(_outputs[0] != NullTensorID) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor ConcatenateLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + // Check if all input tensors are set + bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) + { + return eid != EmptyEdgeID; + }); + + TensorDescriptor output_info = {}; + + if(are_all_inputs_set) + { + std::vector inputs_descriptors; + 
for(unsigned int i = 0; i < _input_edges.size(); ++i) + { + const Tensor *t = _graph->tensor(input_id(i)); + ARM_COMPUTE_ERROR_ON(t == nullptr); + inputs_descriptors.push_back(t->desc()); + } + output_info = compute_output_descriptor(inputs_descriptors, _axis); + } + + return output_info; +} + +NodeType ConcatenateLayerNode::type() const +{ + return NodeType::ConcatenateLayer; +} + +void ConcatenateLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute \ No newline at end of file diff --git a/src/graph/nodes/DepthConcatenateLayerNode.cpp b/src/graph/nodes/DepthConcatenateLayerNode.cpp deleted file mode 100644 index 08cccc1ff1..0000000000 --- a/src/graph/nodes/DepthConcatenateLayerNode.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/INodeVisitor.h" - -namespace arm_compute -{ -namespace graph -{ -DepthConcatenateLayerNode::DepthConcatenateLayerNode(unsigned int total_nodes) - : _total_nodes(total_nodes), _is_enabled(true) -{ - _input_edges.resize(_total_nodes, EmptyEdgeID); - _outputs.resize(1, NullTensorID); -} - -void DepthConcatenateLayerNode::set_enabled(bool is_enabled) -{ - _is_enabled = is_enabled; -} - -bool DepthConcatenateLayerNode::is_enabled() const -{ - return _is_enabled; -} - -TensorDescriptor DepthConcatenateLayerNode::compute_output_descriptor(const std::vector &input_descriptors) -{ - ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0); - - TensorDescriptor output_descriptor = input_descriptors[0]; - - size_t max_x = 0; - size_t max_y = 0; - size_t depth = 0; - - for(const auto &input_descriptor : input_descriptors) - { - max_x = std::max(input_descriptor.shape.x(), max_x); - max_y = std::max(input_descriptor.shape.y(), max_y); - depth += input_descriptor.shape.z(); - } - - output_descriptor.shape.set(0, max_x); - output_descriptor.shape.set(1, max_y); - output_descriptor.shape.set(2, depth); - - return output_descriptor; -} - -bool DepthConcatenateLayerNode::forward_descriptors() -{ - if(_outputs[0] != NullTensorID) - { - Tensor *dst = output(0); - ARM_COMPUTE_ERROR_ON(dst == nullptr); - dst->desc() = configure_output(0); - return true; - } - return false; -} - -TensorDescriptor DepthConcatenateLayerNode::configure_output(size_t idx) const -{ - ARM_COMPUTE_UNUSED(idx); - ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); - - // Check if all input tensors are set - bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) - { - return eid != EmptyEdgeID; - }); - - TensorDescriptor output_info = {}; - - if(are_all_inputs_set) - { - std::vector inputs_descriptors; - for(unsigned int i = 0; i < _input_edges.size(); ++i) - { - const Tensor *t = _graph->tensor(input_id(i)); - ARM_COMPUTE_ERROR_ON(t == nullptr); - inputs_descriptors.push_back(t->desc()); - } - output_info = compute_output_descriptor(inputs_descriptors); - } - - return output_info; -} - -NodeType DepthConcatenateLayerNode::type() const -{ - return NodeType::DepthConcatenateLayer; -} - -void DepthConcatenateLayerNode::accept(INodeVisitor &v) -{ - v.visit(*this); -} -} // namespace graph -} // namespace arm_compute \ No newline at end of file diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp index 61cf42356f..ef156ea252 100644 --- a/src/graph/printers/DotGraphPrinter.cpp +++ b/src/graph/printers/DotGraphPrinter.cpp @@ -47,17 +47,19 @@ void DotGraphVisitor::visit(BatchNormalizationLayerNode &n) _info = ss.str(); } -void DotGraphVisitor::visit(ConvolutionLayerNode &n) +void DotGraphVisitor::visit(ConcatenateLayerNode &n) { std::stringstream ss; - ss << n.convolution_method(); + ss << "Enabled: " << n.is_enabled(); + ss << R"( \n )"; + ss << "Axis: " << n.concatenation_axis(); _info = ss.str(); } -void DotGraphVisitor::visit(DepthConcatenateLayerNode &n) +void DotGraphVisitor::visit(ConvolutionLayerNode &n) { std::stringstream ss; - ss << "Enabled: " << n.is_enabled(); + ss << n.convolution_method(); _info = ss.str(); } diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp index 7ac7bbce2b..2e5d787448 100644 --- a/utils/CommonGraphOptions.cpp +++ 
b/utils/CommonGraphOptions.cpp @@ -84,7 +84,7 @@ namespace utils os << "Data layout : " << common_params.data_layout << std::endl; os << "Tuner enabled? : " << (common_params.enable_tuner ? true_str : false_str) << std::endl; os << "Tuner file : " << common_params.tuner_file << std::endl; - os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::ENABLED ? true_str : false_str) << std::endl; + os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str) << std::endl; if(!common_params.data_path.empty()) { os << "Data path : " << common_params.data_path << std::endl; @@ -168,7 +168,7 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser) CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options) { - FastMathHint fast_math_hint_value = options.fast_math_hint->value() ? FastMathHint::ENABLED : FastMathHint::DISABLED; + FastMathHint fast_math_hint_value = options.fast_math_hint->value() ? FastMathHint::Enabled : FastMathHint::Disabled; auto validation_range = parse_validation_range(options.validation_range->value()); CommonGraphParams common_params; @@ -178,7 +178,7 @@ CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options) common_params.data_type = options.data_type->value(); common_params.data_layout = options.data_layout->value(); common_params.enable_tuner = options.enable_tuner->is_set() ? options.enable_tuner->value() : false; - common_params.fast_math_hint = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::DISABLED; + common_params.fast_math_hint = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::Disabled; common_params.data_path = options.data_path->value(); common_params.image = options.image->value(); common_params.labels = options.labels->value(); diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h index ce4a2c9dd1..23c3cc7c30 100644 --- a/utils/CommonGraphOptions.h +++ b/utils/CommonGraphOptions.h @@ -93,7 +93,7 @@ struct CommonGraphParams arm_compute::DataType data_type{ DataType::F32 }; arm_compute::DataLayout data_layout{ DataLayout::NCHW }; bool enable_tuner{ false }; - arm_compute::graph::FastMathHint fast_math_hint{ arm_compute::graph::FastMathHint::DISABLED }; + arm_compute::graph::FastMathHint fast_math_hint{ arm_compute::graph::FastMathHint::Disabled }; std::string data_path{}; std::string image{}; std::string labels{}; diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index eb03ff9b96..8bfdba9725 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -475,6 +475,35 @@ inline std::string to_string(const arm_compute::DataLayout &data_layout) return str.str(); } +/** Formatted output of the DataLayoutDimension type. + * + * @param[out] os Output stream. + * @param[in] data_layout_dim Data layout dimension to print. + * + * @return Modified output stream. + */ +inline ::std::ostream &operator<<(::std::ostream &os, const DataLayoutDimension &data_layout_dim) +{ + switch(data_layout_dim) + { + case DataLayoutDimension::WIDTH: + os << "WIDTH"; + break; + case DataLayoutDimension::HEIGHT: + os << "HEIGHT"; + break; + case DataLayoutDimension::CHANNEL: + os << "CHANNEL"; + break; + case DataLayoutDimension::BATCHES: + os << "BATCHES"; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return os; +} + /** Formatted output of the DataType type. * * @param[out] os Output stream. -- cgit v1.2.1
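The example updates in this patch all repeat one layout-aware pattern: the input shape is written in NCHW order, permuted to the layout requested on the command line, the resulting descriptor is tagged with that layout, and every weights accessor is told the trained layout (NCHW) so the weights can be permuted when the graph runs in NHWC. The condensed sketch below shows that pattern in isolation; it is illustrative only and not taken from the patch. It reuses the includes, helper functions (CommonGraphParams, permute_shape, get_input_accessor, get_weights_accessor) and resnext50 file names already shown in the examples above, and the function name add_input_and_first_conv is invented for the sketch.

// Condensed, illustrative sketch of the layout-aware example pattern (not part of the patch).
// Helpers and file names are the ones the graph examples above already use.
#include "arm_compute/graph.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
#include "utils/Utils.h"

using namespace arm_compute::utils;
using namespace arm_compute::graph::frontend;
using namespace arm_compute::graph_utils;

void add_input_and_first_conv(Stream &graph, const CommonGraphParams &common_params, const std::string &data_path)
{
    // Author the shape in NCHW and permute it to the requested data layout.
    const TensorShape tensor_shape     = permute_shape(TensorShape(224U, 224U, 3U, 1U), DataLayout::NCHW, common_params.data_layout);
    TensorDescriptor  input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);

    // Weights stay in the layout they were trained in (NCHW for these models).
    const DataLayout weights_layout = DataLayout::NCHW;

    graph << common_params.target
          << common_params.fast_math_hint
          << InputLayer(input_descriptor, get_input_accessor(common_params))
          << ConvolutionLayer(7U, 7U, 64U,
                              get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout),
                              get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"),
                              PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR));
}

Passing weights_layout to the accessor is what allows the same trained .npy files to be reused unchanged when the graph itself executes in NHWC.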