Diffstat (limited to 'src/graph/mutators')
 src/graph/mutators/DepthConcatSubTensorMutator.cpp |  45
 src/graph/mutators/GroupedConvolutionMutator.cpp   |  81
 src/graph/mutators/InPlaceOperationMutator.cpp     | 235
 src/graph/mutators/MutatorUtils.cpp                |  52
 src/graph/mutators/MutatorUtils.h                  |  42
 src/graph/mutators/NodeExecutionMethodMutator.cpp  |  46
 src/graph/mutators/NodeFusionMutator.cpp           | 304
 src/graph/mutators/SplitLayerSubTensorMutator.cpp  |  41
 src/graph/mutators/SyntheticDataTypeMutator.cpp    | 113
9 files changed, 698 insertions(+), 261 deletions(-)
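Every file in this diffstat implements the library's graph-mutator interface: a pass reports a human-readable name(), declares via type() whether it rewrites the backend-specific or the IR-level view of the graph, and performs its rewrite in mutate(Graph &). For orientation, below is a minimal sketch of a pass following that pattern; the NoOpMutator class is hypothetical, and the interface spelling is inferred from the signatures visible in the hunks that follow rather than quoted from IGraphMutator.h.

#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/IGraphMutator.h"
#include "arm_compute/graph/Logger.h"

namespace arm_compute
{
namespace graph
{
// Hypothetical pass, for illustration only: visits every node and does nothing.
class NoOpMutator final : public IGraphMutator
{
public:
    const char *name() override
    {
        return "NoOpMutator";
    }
    IGraphMutator::MutationType type() const override
    {
        return IGraphMutator::MutationType::IR;
    }
    void mutate(Graph &g) override
    {
        for (auto &node : g.nodes())
        {
            // Removed nodes leave null slots in the node list, hence the
            // null check mirrored throughout the mutators in this patch.
            if (node == nullptr)
            {
                continue;
            }
            ARM_COMPUTE_LOG_GRAPH_VERBOSE("Visiting node with ID : " << node->id() << std::endl);
        }
    }
};
} // namespace graph
} // namespace arm_compute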
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp index 30d6700446..1b7ee3c4a4 100644 --- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp +++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,15 @@ */ #include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/Iterable.h" +#include "support/Cast.h" +#include "support/Iterable.h" namespace arm_compute { @@ -50,7 +50,7 @@ IGraphMutator::MutationType DepthConcatSubTensorMutator::type() const void DepthConcatSubTensorMutator::mutate(Graph &g) { // Early exit if no Concatenation layers exist in graph - if(g.nodes(NodeType::ConcatenateLayer).empty()) + if (g.nodes(NodeType::ConcatenateLayer).empty()) { return; } @@ -59,43 +59,48 @@ void DepthConcatSubTensorMutator::mutate(Graph &g) std::vector<NodeID> topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) + if (node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) { // Get output tensor auto output_tensor = node->output(0); // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2) auto *concat_node = arm_compute::utils::cast::polymorphic_downcast<ConcatenateLayerNode *>(node); - if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) + if (output_tensor == nullptr || + get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) { continue; } // Check that all tensor have the same target, valid inputs and same quantization info - bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), - [&](const EdgeID & eid) - { - return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) - && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); - }); + bool is_valid = + std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), + [&](const EdgeID &eid) + { + return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && + (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) && + (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); + }); // Create subtensors - if(is_valid && is_target_supported(output_tensor->desc().target)) + if (is_valid && is_target_supported(output_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and 
name : " << node->name() << std::endl); // Create sub-tensor handles unsigned depth = 0; - for(unsigned int i = 0; i < node->input_edges().size(); ++i) + for (unsigned int i = 0; i < node->input_edges().size(); ++i) { auto input_tensor = node->input(i); const auto input_shape = input_tensor->desc().shape; - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(input_tensor->desc().target); - std::unique_ptr<ITensorHandle> handle = backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(input_tensor->desc().target); + std::unique_ptr<ITensorHandle> handle = + backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); input_tensor->set_handle(std::move(handle)); depth += input_shape.z(); diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp index f8494a872f..31efba6bb1 100644 --- a/src/graph/mutators/GroupedConvolutionMutator.cpp +++ b/src/graph/mutators/GroupedConvolutionMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,14 @@ */ #include "arm_compute/graph/mutators/GroupedConvolutionMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" - +#include "support/Cast.h" #include "support/StringSupport.h" #include <set> @@ -42,43 +41,51 @@ namespace graph { namespace { -NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPair input, NodeID weights, NodeID bias, - PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups) +NodeID create_grouped_convolution(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + NodeID weights, + NodeID bias, + PadStrideInfo conv_info, + ConvolutionMethod method, + ActivationLayerInfo fused_act, + FastMathHint fast_math_hint, + unsigned int num_groups) { bool has_bias = (bias != EmptyNodeID); // Split input const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); - const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); - NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); + const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); + NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); // Split weights const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]); - const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); - NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx); + const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); + NodeID weights_split = GraphBuilder::add_split_node(g, params, {weights, 0}, num_groups, batch_idx); // Split bias NodeID 
bias_split = EmptyNodeID; - if(has_bias) + if (has_bias) { // Split bias - bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0); + bias_split = GraphBuilder::add_split_node(g, params, {bias, 0}, num_groups, 0); } std::vector<NodeIdxPair> convolution_outputs; - for(unsigned int i = 0; i < num_groups; ++i) + for (unsigned int i = 0; i < num_groups; ++i) { NodeParams group_params = params; NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, 1, method, fast_math_hint); g.add_connection(input_split, i, conv_nid, 0); g.add_connection(weights_split, i, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(bias_split, i, conv_nid, 2); } // Add group name - if(!group_params.name.empty()) + if (!group_params.name.empty()) { group_params.name.append("_g" + arm_compute::support::cpp11::to_string(i)); } @@ -92,7 +99,7 @@ NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPai auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node); conv_node->set_fused_activation(fused_act); - convolution_outputs.push_back({ conv_nid, 0 }); + convolution_outputs.push_back({conv_nid, 0}); } // Depth concatenate output @@ -113,7 +120,7 @@ IGraphMutator::MutationType GroupedConvolutionMutator::type() const void GroupedConvolutionMutator::mutate(Graph &g) { // Early exit if no Convolution layers exist in graph - if(g.nodes(NodeType::ConvolutionLayer).empty()) + if (g.nodes(NodeType::ConvolutionLayer).empty()) { return; } @@ -122,17 +129,18 @@ void GroupedConvolutionMutator::mutate(Graph &g) size_t total_nodes = g.nodes().size(); // Iterate over convolution nodes - for(unsigned int i = 0; i < total_nodes; ++i) + for (unsigned int i = 0; i < total_nodes; ++i) { INode *node = g.node(i); - if(node != nullptr && node->type() == NodeType::ConvolutionLayer && arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1) + if (node != nullptr && node->type() == NodeType::ConvolutionLayer && + arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // If grouped convolution is not supported - if(!bool(status)) + if (!bool(status)) { // Down-cast node auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node); @@ -151,7 +159,8 @@ void GroupedConvolutionMutator::mutate(Graph &g) ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr); const NodeID input_id = conv_node->input_edge(0)->producer()->id(); const NodeID weights_id = conv_node->input_edge(1)->producer()->id(); - const NodeID bias_id = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID; + const NodeID bias_id = + (conv_node->input_edge(2) != nullptr) ? 
conv_node->input_edge(2)->producer()->id() : EmptyNodeID; // Get driving nodes std::vector<NodeIdxPair> driving_nodes = get_driving_nodes(*node); @@ -164,14 +173,15 @@ void GroupedConvolutionMutator::mutate(Graph &g) NodeID latest_nid = g.nodes().size(); // Create grouped convolution node - NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id, - conv_info, conv_method, fused_act_info, fast_math_hint, num_groups); + NodeID grouped_conv_id = + create_grouped_convolution(g, params, {input_id, 0}, weights_id, bias_id, conv_info, conv_method, + fused_act_info, fast_math_hint, num_groups); // Remove convolution node g.remove_node(node->id()); // Update batch normalization node outputs - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(grouped_conv_id, 0, driving_node.node_id, driving_node.index); } @@ -180,17 +190,16 @@ void GroupedConvolutionMutator::mutate(Graph &g) g.node(grouped_conv_id)->output(0)->set_accessor(std::move(node_accessor)); // Configure new tensors and nodes - std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), [](std::unique_ptr<Tensor> &t) - { - configure_tensor(t.get()); - }); - std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), [&assigned_target](std::unique_ptr<INode> &n) - { - if(n != nullptr) - { - n->set_assigned_target(assigned_target); - } - }); + std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), + [](std::unique_ptr<Tensor> &t) { configure_tensor(t.get()); }); + std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), + [&assigned_target](std::unique_ptr<INode> &n) + { + if (n != nullptr) + { + n->set_assigned_target(assigned_target); + } + }); } } } diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp index 3b06537cd9..a51dcc4f42 100644 --- a/src/graph/mutators/InPlaceOperationMutator.cpp +++ b/src/graph/mutators/InPlaceOperationMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,13 +23,193 @@ */ #include "arm_compute/graph/mutators/InPlaceOperationMutator.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" +#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" + +#include "support/Cast.h" + +using namespace arm_compute::utils::cast; namespace arm_compute { namespace graph { +namespace +{ +// Check if the output edges of the parent node are separate tensors. If not, +// it means the same output is connected to multiple nodes and computations on +// these nodes cannot be done in-place. +bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge) +{ + const auto parent_node = input_edge->producer(); + const auto input_tensor = input_edge->tensor(); + const auto input_edge_id = input_edge->id(); + + if (parent_node == nullptr) + { + return false; + } + + const auto output_edges = parent_node->output_edges(); + + // If the output is connected to only one edge, then computations can + // be done in-place. 
+ if (output_edges.size() == 1) + { + return true; + } + + return std::all_of(output_edges.begin(), output_edges.end(), + [&](const EdgeID &edge_id) + { + // Skip check on current input edge + if (edge_id == input_edge_id) + { + return true; + } + + auto edge = g.edge(edge_id); + return edge->tensor() != input_tensor; + }); +} + +// If do in-place calculation, then need to use the new output and inherit original output's accessor +void set_new_output_and_inherit_accessor(std::unique_ptr<INode> &node, Tensor *orig_output, Tensor *new_output) +{ + ARM_COMPUTE_LOG_GRAPH_INFO("Switching to in-place computation for the node with ID : " + << node->id() << " and name : " << node->name() << std::endl); + // Update accessor + new_output->set_accessor(orig_output->extract_accessor()); + // Update output + node->set_output_tensor(new_output->id(), 0); +} + +// Try to mutate the node to perform the depthwise in-place calculation +void try_in_place_depthwiseconv(std::unique_ptr<INode> &node) +{ + // Get input edge + Edge *input_edge = node->input_edge(0); + Edge *weight_edge = node->input_edge(1); + ARM_COMPUTE_ERROR_ON(input_edge == nullptr || weight_edge == nullptr); + + auto input_tensor = input_edge->tensor(); + auto weight_tensor = weight_edge->tensor(); + ARM_COMPUTE_ERROR_ON(input_tensor == nullptr || weight_tensor == nullptr); + + const auto input_shape = input_tensor->desc().shape; + const auto qinfo_input = input_tensor->desc().quant_info; + + const auto weight_shape = weight_tensor->desc().shape; + const auto weight_layout = weight_tensor->desc().layout; + + // Extract PadStrideInfo and depth multiplier + PadStrideInfo conv_info{}; + unsigned int depth_multiplier{}; + if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) + { + conv_info = + polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info(); + depth_multiplier = + polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier(); + } + else if (node->type() == NodeType::DepthwiseConvolutionLayer) + { + conv_info = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->convolution_info(); + depth_multiplier = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->depth_multiplier(); + } + + // Get current output tensor + auto current_output_tensor = node->output(0); + ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr); + const auto out_shape = current_output_tensor->desc().shape; + const auto qinfo_out = current_output_tensor->desc().quant_info; + + bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && + (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); + + // Specify conditions with which input can be in-placed + input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC; + + const int weights_width_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weight_layout, DataLayoutDimension::HEIGHT); + const bool is_1x1 = weight_shape[weights_width_idx] == 1U && weight_shape[weights_height_idx] == 1U; + input_can_in_place &= is_1x1; + + input_can_in_place &= depth_multiplier == 1; + input_can_in_place &= conv_info.stride() == std::make_pair(1U, 1U); + input_can_in_place &= !conv_info.has_padding(); + // NOTE: Dilation should also be (1, 1). 
However currently dilation is not supported in the depthwise conv node + + if (input_can_in_place) + { + set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor); + } + else + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); + } +} + +// Try to mutate the node to perform the elementwise in-place calculation +void try_in_place_elementwise(std::unique_ptr<INode> &node) +{ + // Get input edge + Edge *input0_edge = node->input_edge(0); + Edge *input1_edge = node->input_edge(1); + ARM_COMPUTE_ERROR_ON(input0_edge == nullptr || input1_edge == nullptr); + + auto input0_tensor = input0_edge->tensor(); + auto input1_tensor = input1_edge->tensor(); + ARM_COMPUTE_ERROR_ON(input0_tensor == nullptr || input1_tensor == nullptr); + + const auto shape0 = input0_tensor->desc().shape; + const auto shape1 = input1_tensor->desc().shape; + const auto qinfo0 = input0_tensor->desc().quant_info; + const auto qinfo1 = input1_tensor->desc().quant_info; + + const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1); + // Inputs are not broadcast compatible + if (out_shape.total_size() == 0) + { + return; + } + + // Get current output tensor + auto current_output_tensor = node->output(0); + ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr); + const auto qinfo_out = current_output_tensor->desc().quant_info; + + // Can do in place, if the input has same shape as output, has same quntisation info as output, has same data type as output and input doesn't have accessor. + bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && + (qinfo0 == qinfo_out) && + (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input0_tensor->accessor() == nullptr); + bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && + (qinfo1 == qinfo_out) && + (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input1_tensor->accessor() == nullptr); + + if (input0_can_in_place) + { + set_new_output_and_inherit_accessor(node, current_output_tensor, input0_tensor); + } + else if (input1_can_in_place) + { + set_new_output_and_inherit_accessor(node, current_output_tensor, input1_tensor); + } + else + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); + } +} +} // namespace + const char *InPlaceOperationMutator::name() { return "InPlaceOperationMutator"; @@ -42,38 +222,53 @@ IGraphMutator::MutationType InPlaceOperationMutator::type() const void InPlaceOperationMutator::mutate(Graph &g) { - std::set<NodeType> in_place_nodes = { NodeType::BatchNormalizationLayer, NodeType::ActivationLayer, NodeType::PrintLayer }; + std::set<NodeType> in_place_nodes = {NodeType::ActivationLayer, + NodeType::BatchNormalizationLayer, + NodeType::EltwiseLayer, + NodeType::UnaryEltwiseLayer, + NodeType::DepthwiseConvolutionLayer, + NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, + NodeType::PrintLayer}; // Not interested in the order of nodes - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node && in_place_nodes.find(node->type()) != std::end(in_place_nodes)) + if (node && in_place_nodes.find(node->type()) != std::end(in_place_nodes)) { // Get input edge Edge *input_edge = node->input_edge(0); // Check if parent has a single output if 
yes then force in place calculation else not - if((input_edge != nullptr) && (input_edge->producer() != nullptr) && (input_edge->producer()->output_edges().size() == 1)) + if ((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge)) { - // Get current and new output tensors - auto current_output_tensor = node->output(0); - auto new_output_tensor = input_edge->tensor(); - - ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr); - - // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different - if(new_output_tensor->accessor() != nullptr || current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) + if (node->type() == NodeType::EltwiseLayer) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + try_in_place_elementwise(node); + } + else if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || + node->type() == NodeType::DepthwiseConvolutionLayer) + { + try_in_place_depthwiseconv(node); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Switching to in-place computation for the node with ID : " - << node->id() << " and name : " << node->name() << std::endl); - // Update accessor - new_output_tensor->set_accessor(current_output_tensor->extract_accessor()); - // Update output - node->set_output_tensor(new_output_tensor->id(), 0); + // Get current and new output tensors + auto current_output_tensor = node->output(0); + auto new_output_tensor = input_edge->tensor(); + + ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr); + + // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different + if (new_output_tensor->accessor() != nullptr || + current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to " + "the input tensor or the quantization info are different.\n"); + } + else + { + set_new_output_and_inherit_accessor(node, current_output_tensor, new_output_tensor); + } } } } diff --git a/src/graph/mutators/MutatorUtils.cpp b/src/graph/mutators/MutatorUtils.cpp new file mode 100644 index 0000000000..f47240eadd --- /dev/null +++ b/src/graph/mutators/MutatorUtils.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/graph/mutators/MutatorUtils.h" + +namespace arm_compute +{ +namespace graph +{ +bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list) +{ + if (layout == DataLayout::NCHW || layout == DataLayout::NHWC) + { + const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT); + const unsigned int width_index = get_dimension_idx(layout, DataLayoutDimension::WIDTH); + + for (unsigned int i = 0; i < padding_list.size(); ++i) + { + if (i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0)) + { + // if the index is not either height or width, don't fuse + return false; + } + } + + return true; + } + + return false; +} +} // namespace graph +} // namespace arm_compute diff --git a/src/graph/mutators/MutatorUtils.h b/src/graph/mutators/MutatorUtils.h new file mode 100644 index 0000000000..170d892c93 --- /dev/null +++ b/src/graph/mutators/MutatorUtils.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_MUTATOR_UTILS_H +#define ARM_COMPUTE_GRAPH_MUTATOR_UTILS_H + +#include "arm_compute/graph/Utils.h" + +namespace arm_compute +{ +namespace graph +{ +/** Check if padding is in height and/or width dimensions + * + * @param[in] layout Data layout of the tensor + * @param[in] padding_list List of padding pairs + */ +bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list); +} // namespace graph +} // namespace arm_compute + +#endif /* ARM_COMPUTE_GRAPH_MUTATOR_UTILS_H */
\ No newline at end of file diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp index 72e2645dd2..588befecae 100644 --- a/src/graph/mutators/NodeExecutionMethodMutator.cpp +++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,13 +23,13 @@ */ #include "arm_compute/graph/mutators/NodeExecutionMethodMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" namespace arm_compute { @@ -49,17 +49,17 @@ template <typename Setter> void set_default_on_invalid_method(Graph &g, NodeType node_type, Setter &&setter) { const std::vector<NodeID> &node_ids = g.nodes(node_type); - for(auto &node_id : node_ids) + for (auto &node_id : node_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // Set default execution method in case of failure - if(!bool(status)) + if (!bool(status)) { setter(node); } @@ -81,22 +81,26 @@ IGraphMutator::MutationType NodeExecutionMethodMutator::type() const void NodeExecutionMethodMutator::mutate(Graph &g) { // Convolution Layer - set_default_on_invalid_method(g, NodeType::ConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n); - casted_node->set_convolution_method(ConvolutionMethod::Default); - }); + set_default_on_invalid_method(g, NodeType::ConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = + arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n); + casted_node->set_convolution_method(ConvolutionMethod::Default); + }); // Depthwise Convolution Layer - set_default_on_invalid_method(g, NodeType::DepthwiseConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n); - casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); - }); + set_default_on_invalid_method( + g, NodeType::DepthwiseConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n); + casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); + }); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp index 
ae53b8ff75..998a4a05c7 100644 --- a/src/graph/mutators/NodeFusionMutator.cpp +++ b/src/graph/mutators/NodeFusionMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,18 @@ */ #include "arm_compute/graph/mutators/NodeFusionMutator.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/graph/mutators/MutatorUtils.h" +#include "support/Cast.h" +#include <list> #include <set> namespace arm_compute @@ -40,24 +43,60 @@ namespace graph { namespace detail { +void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode *old_node, bool add_output_tensor) +{ + if (new_node == nullptr || old_node == nullptr) + { + return; + } + + // Get driving nodes of last fusable node + std::vector<NodeIdxPair> last_driving_nodes = get_driving_nodes(*old_node); + + // Extract last fusable node accessor if any + if (old_node->output(0) == nullptr) + { + return; + } + auto old_node_accessor = old_node->output(0)->extract_accessor(); + + // Remove node + g.remove_node(old_node->id()); + + // Update fused node outputs + for (auto &driving_node : last_driving_nodes) + { + g.add_connection(new_node->id(), 0, driving_node.node_id, driving_node.index); + if (add_output_tensor) + { + configure_tensor(new_node->output(0)); + } + } + + // Update accessor to fused node + new_node->output(0)->set_accessor(std::move(old_node_accessor)); +} + void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge) { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer()); // Not fusing if number of groups is greater than 1 - if(conv_node->num_groups() > 1) + if (conv_node->num_groups() > 1) { return; } - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) + if (conv_node->output(0)->accessor() == nullptr) { const Target assigned_target = conv_node->assigned_target(); @@ -77,9 +116,10 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); + const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>( + 
epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); - if(conv_node->input_edge(2) != nullptr) + if (conv_node->input_edge(2) != nullptr) { auto conv_bias_id = conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -91,45 +131,33 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge g.add_connection(bn_mean_id, 0, fused_id, 3); g.add_connection(bn_var_id, 0, fused_id, 4); - if(bn_node->input_edge(3) != nullptr) + if (bn_node->input_edge(3) != nullptr) { const auto bn_beta_id = bn_node->input_edge(3)->producer_id(); g.add_connection(bn_beta_id, 0, fused_id, 5); } - if(bn_node->input_edge(4) != nullptr) + if (bn_node->input_edge(4) != nullptr) { const auto bn_gamma_id = bn_node->input_edge(4)->producer_id(); g.add_connection(bn_gamma_id, 0, fused_id, 6); } - auto fused_node = g.node(fused_id); - std::vector<NodeIdxPair> bn_driving_nodes = get_driving_nodes(*bn_node); + auto fused_node = g.node(fused_id); + auto bn_node_name = bn_node->name(); - // Extract batch normalization node accessor if any - auto bn_node_accessor = bn_node->output(0)->extract_accessor(); - auto bn_node_name = bn_node->name(); + transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); - // Remove batch normalization node - g.remove_node(bn_node->id()); - - // Get driving nodes of batch normalization node - for(auto &driving_node : bn_driving_nodes) - { - g.add_connection(fused_id, 0, driving_node.node_id, driving_node.index); - configure_tensor(fused_node->output(0)); - } - // Update fused node outputs - fused_node->output(0)->set_accessor(std::move(bn_node_accessor)); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters(NodeParams{conv_node->name() + "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); } } @@ -137,14 +165,17 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - auto *depth_conv_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer()); + auto *depth_conv_node = + arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer()); - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(depth_conv_node->output(0)->accessor() == nullptr) + if (depth_conv_node->output(0)->accessor() == nullptr) { const Target 
assigned_target = depth_conv_node->assigned_target(); @@ -164,9 +195,10 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>(epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); + const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>( + epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); - if(depth_conv_node->input_edge(2) != nullptr) + if (depth_conv_node->input_edge(2) != nullptr) { const auto conv_bias_id = depth_conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -180,38 +212,29 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o g.add_connection(bn_beta_id, 0, fused_id, 5); g.add_connection(bn_gamma_id, 0, fused_id, 6); - auto fused_node = g.node(fused_id); - std::vector<NodeIdxPair> bn_driving_nodes = get_driving_nodes(*bn_node); - - // Extract batch normalization node accessor if any - auto bn_node_accessor = bn_node->output(0)->extract_accessor(); - auto bn_node_name = bn_node->name(); + auto fused_node = g.node(fused_id); + auto bn_node_name = bn_node->name(); - // Remove batch normalization node - g.remove_node(bn_node->id()); + transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); - // Get driving nodes of batch normalization node - for(auto &driving_node : bn_driving_nodes) - { - g.add_connection(fused_id, 0, driving_node.node_id, driving_node.index); - configure_tensor(fused_node->output(0)); - } - // Update fused node outputs - fused_node->output(0)->set_accessor(std::move(bn_node_accessor)); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ depth_conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters( + NodeParams{depth_conv_node->name() + "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(depth_conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the " + "presence of an output accessor\n"); } } template <typename N> -void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set<Activation> &supported_fused_activations) +void fuse_node_with_activation(Graph &g, + const Edge *output_edge, + const std::set<Activation> &supported_fused_activations) { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); @@ -221,64 +244,126 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr); // Check if activation is supported for fusion - if(supported_fused_activations.count(act_node->activation_info().activation()) == 0) + if (supported_fused_activations.count(act_node->activation_info().activation()) == 0) + { + return; + } + + // EltwiseLayerNode can only be fused when dataype is float + if (n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type)) { return; } ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id() - << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl); + << " 
with Activation Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(n_node->output(0)->accessor() == nullptr) + if (n_node->output(0)->accessor() == nullptr) { - // Get driving nodes of activation node - std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node); - // Set activation info to fused node n_node->set_fused_activation(act_node->activation_info()); - // Extract activation node accessor if any - auto act_node_accessor = act_node->output(0)->extract_accessor(); + transfer_driving_nodes_and_remove_old_node(g, n_node, act_node, false); + } + else + { + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of node with activation due to the presence of an output accessor\n"); + } +} + +template <typename N> +void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) +{ + auto *pad_node = arm_compute::utils::cast::polymorphic_downcast<PadLayerNode *>(output_edge->producer()); + auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->consumer()); + + const Edge *input_edge = pad_node->input_edge(0); + if (input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr && + pad_node->pad_value().get<float>() == 0.0) + { + const DataLayout layout = input_edge->tensor()->desc().layout; + const PaddingList padding_list = pad_node->padding(); - // Remove activation node - g.remove_node(act_node->id()); + const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT); + const unsigned int width_index = get_dimension_idx(layout, DataLayoutDimension::WIDTH); - // Update fused node outputs - for(auto &driving_node : act_driving_nodes) + const PaddingInfo pad_w = width_index < padding_list.size() ? padding_list[width_index] : PaddingInfo(0, 0); + const PaddingInfo pad_h = height_index < padding_list.size() ? padding_list[height_index] : PaddingInfo(0, 0); + + if (is_padding_in_height_or_width(layout, padding_list)) { - g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index); + // Add paddings to the convolution node + const PadStrideInfo conv_info = conv_node->convolution_info(); + const PadStrideInfo new_conv_info(conv_info.stride().first, conv_info.stride().second, + conv_info.pad_left() + pad_w.first, conv_info.pad_right() + pad_w.second, + conv_info.pad_top() + pad_h.first, conv_info.pad_bottom() + pad_h.second, + conv_info.round()); + conv_node->set_convolution_info(new_conv_info); + + // Update drivers of the convolution node + std::vector<NodeIdxPair> pad_driver_nodes = get_driver_nodes(*pad_node); + g.remove_node(pad_node->id()); + + // Update fused node inputs + for (auto &driver_node : pad_driver_nodes) + { + g.add_connection(driver_node.node_id, driver_node.index, conv_node->id(), 0); + } } - - // Update accessor to fused node - n_node->output(0)->set_accessor(std::move(act_node_accessor)); - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n"); } } template <typename N1, typename N2, typename F, typename... Args> -void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments) +void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments) { - // Not interested in the order of nodes - for(auto &node : g.nodes()) + // Note that fused nodes may be added to the end of the node list. 
+ // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing. + // This is intentional as it probes the newly added fused nodes for further fusing opportunities. + for (unsigned int i = 0; i < g.nodes().size(); ++i) { - // Check if the node is of type N and not a branching node - if(node && node->type() == N1::node_type && node->output_edges().size() == 1) + auto node = g.node(i); + // Check if the node is of type N1 and not a branching node + if (node && node->type() == N1::node_type && node->output_edges().size() == 1) { const auto output_edge_id = *node->output_edges().begin(); const auto output_edge = g.edge(output_edge_id); - // Check if following node is an activation layer node - if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) + // Check if following node is a type N2 node + if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && + (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) { fuse_fcn(g, output_edge, optional_arguments...); } } } } + +template <typename N1, typename F, typename... Args> +void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments) +{ + // Note that fused nodes may be added to the end of the node list. + // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing. + // This is intentional as it probes the newly added fused nodes for further fusing opportunities. + for (unsigned int i = 0; i < g.nodes().size(); ++i) + { + auto node = g.node(i); + // Check if the node is of type N1 and not a branching node + if (node && node->type() == N1::node_type && node->output_edges().size() == 1) + { + const auto output_edge_id = *node->output_edges().begin(); + const auto output_edge = g.edge(output_edge_id); + + // Check if it's the correct target + if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer())) + { + fuse_fcn(g, output_edge, i, optional_arguments...); + } + } + } +} } // namespace detail const char *NodeFusionMutator::name() @@ -294,41 +379,50 @@ IGraphMutator::MutationType NodeFusionMutator::type() const void NodeFusionMutator::mutate(Graph &g) { // Supported activations when fusing - const std::set<Activation> supported_fused_activations_conv = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU }; - const std::set<Activation> supported_fused_activations_eltwise = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU, - Activation::TANH, Activation::LOGISTIC - }; + const std::set<Activation> supported_fused_activations = { + Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU, + Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU, + Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU, + Activation::RELU, Activation::SOFT_RELU, Activation::SQRT, + Activation::SQUARE, Activation::TANH}; // Preconditions - auto empty_prec = [](INode &) - { - return true; - }; - auto cl_target_prec = [](INode & n) - { - return n.assigned_target() == Target::CL; - }; - auto qs8_prec = [&g](INode & n) + auto empty_prec = [](INode &) { return true; }; + auto cl_target_prec = [](INode &n) { return n.assigned_target() == Target::CL; }; + auto qs8_prec = [&g](INode &n) { ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr); const auto 
output_edge_id = *n.output_edges().begin(); const auto output_edge = g.edge(output_edge_id); // To perform fusion the two nodes must have same output quantization information - const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info; + const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info; const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8; return (output_qasymm8 && same_qinfo) || !output_qasymm8; }; // Fusion mutations - detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations_conv); - detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations_conv); - detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations_conv); - detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations_conv); - detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations_eltwise); - detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization); - detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); + + detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, + detail::fuse_pad_with_convolution<ConvolutionLayerNode>); + detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>( + g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>); + detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>( + g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations); + detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>( + g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations); + detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>( + g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations); + detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>( + g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations); + detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>( + g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations); + // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. 
Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any + detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>( + g, empty_prec, detail::fuse_convolution_with_batch_normalization); + detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>( + g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp index 3ba73071ed..533f8944cf 100644 --- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp +++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,15 @@ */ #include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/Iterable.h" +#include "support/Cast.h" +#include "support/Iterable.h" namespace arm_compute { @@ -50,7 +50,7 @@ IGraphMutator::MutationType SplitLayerSubTensorMutator::type() const void SplitLayerSubTensorMutator::mutate(Graph &g) { // Early exit if no Split layers exist in graph - if(g.nodes(NodeType::SplitLayer).empty()) + if (g.nodes(NodeType::SplitLayer).empty()) { return; } @@ -59,43 +59,46 @@ void SplitLayerSubTensorMutator::mutate(Graph &g) std::vector<NodeID> topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) + if (node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) { // Get output tensor Tensor *input_tensor = node->input(0); // Check that all tensor have the same target and are valid bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(), - [&](const TensorID & tid) - { - return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target); - }); + [&](const TensorID &tid) { + return (g.tensor(tid) != nullptr) && + (g.tensor(tid)->desc().target == input_tensor->desc().target); + }); // Create subtensors - if(is_valid && is_target_supported(input_tensor->desc().target)) + if (is_valid && is_target_supported(input_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and name : " << node->name() << std::endl); auto *split_node = arm_compute::utils::cast::polymorphic_downcast<SplitLayerNode *>(node); - const unsigned int axis = split_node->axis(); + const int axis = split_node->axis(); const unsigned int num_splits = split_node->num_splits(); const bool extend_parent = (axis < 2); // Create sub-tensor handles - for(unsigned int i = 0; i < node->outputs().size(); ++i) + for 
(unsigned int i = 0; i < node->outputs().size(); ++i) { Tensor *output_tensor = node->output(i); const TensorShape output_shape = output_tensor->desc().shape; Coordinates coords; - std::tie(std::ignore, coords) = SplitLayerNode::compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); + std::tie(std::ignore, coords) = + split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target); - std::unique_ptr<ITensorHandle> handle = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(output_tensor->desc().target); + std::unique_ptr<ITensorHandle> handle = + backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); output_tensor->set_handle(std::move(handle)); } } diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp index 0a9f5058dd..3dc2480e85 100644 --- a/src/graph/mutators/SyntheticDataTypeMutator.cpp +++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,10 @@ #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/ITensorAccessor.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" #include <set> @@ -62,14 +62,12 @@ public: */ bool is_mutation_supported(Graph &g) { - const std::set<NodeType> unsupported_node_types = { NodeType::DetectionOutputLayer, - NodeType::NormalizationLayer, - NodeType::PriorBoxLayer - }; + const std::set<NodeType> unsupported_node_types = {NodeType::DetectionOutputLayer, NodeType::NormalizationLayer, + NodeType::PriorBoxLayer}; - for(const auto &utype : unsupported_node_types) + for (const auto &utype : unsupported_node_types) { - if(!g.nodes(utype).empty()) + if (!g.nodes(utype).empty()) { return false; } @@ -83,12 +81,12 @@ bool is_mutation_supported(Graph &g) */ void remove_optimized_nodes(Graph &g) { - const std::set<NodeType> optimized_node_types = { NodeType::BatchNormalizationLayer }; + const std::set<NodeType> optimized_node_types = {NodeType::BatchNormalizationLayer}; - for(const auto &opt_type : optimized_node_types) + for (const auto &opt_type : optimized_node_types) { const std::vector<NodeID> opt_nodes_ids = g.nodes(opt_type); - for(const auto &node_id : opt_nodes_ids) + for (const auto &node_id : opt_nodes_ids) { INode *node = g.node(node_id); @@ -108,7 +106,7 @@ void remove_optimized_nodes(Graph &g) g.remove_node(node->id()); // Update connections - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(producer->id(), producer_edge_id, driving_node.node_id, driving_node.index); } @@ -120,15 +118,28 @@ void remove_optimized_nodes(Graph &g) * * @param[in,out] g Graph to convert tensors of. 
*/ -void convert_tensors(Graph &g) +void convert_tensors(Graph &g, DataType data_type) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor != nullptr) + if (tensor != nullptr) { - tensor->desc().data_type = DataType::QASYMM8; - tensor->desc().quant_info = QuantizationInfo(0.125f, -10); + switch (data_type) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + tensor->desc().quant_info = QuantizationInfo(0.125f, -10); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported mutation type"); + break; + } + } + tensor->desc().data_type = data_type; } } } @@ -143,7 +154,7 @@ template <typename NT> void convert_special_node(Graph &g, std::function<bool(INode *, Tensor *)> const &f) { const std::vector<NodeID> nodes_ids = g.nodes(NT::node_type); - for(const auto &nodes_id : nodes_ids) + for (const auto &nodes_id : nodes_ids) { INode *node = arm_compute::utils::cast::polymorphic_downcast<NT *>(g.node(nodes_id)); ARM_COMPUTE_ERROR_ON(node == nullptr); @@ -161,23 +172,44 @@ void convert_special_node(Graph &g, std::function<bool(INode *, Tensor *)> const */ void convert_special_tensors(Graph &g) { - auto softmax_func = [](INode * node, Tensor * tensor) + auto softmax_func = [](INode *node, Tensor *tensor) { ARM_COMPUTE_UNUSED(node); - tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); + if (tensor->desc().data_type == DataType::QASYMM8) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); + } + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); + } return true; }; - auto act_func = [](INode * node, Tensor * tensor) + auto act_func = [](INode *node, Tensor *tensor) { auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(node); - if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + if (tensor->desc().data_type == DataType::QASYMM8) { - tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128); + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128); + } + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); + } } - else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) { - tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 0); + } + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); + } } return true; }; @@ -194,22 +226,19 @@ void convert_special_tensors(Graph &g) */ void handle_nodes_with_bias(Graph &g) { - const std::set<NodeType> special_node_types = { NodeType::ConvolutionLayer, - NodeType::DeconvolutionLayer, - NodeType::DepthwiseConvolutionLayer, - NodeType::FullyConnectedLayer - }; + const std::set<NodeType> special_node_types = {NodeType::ConvolutionLayer, NodeType::DeconvolutionLayer, + NodeType::DepthwiseConvolutionLayer, NodeType::FullyConnectedLayer}; - for(const auto &spc_type : special_node_types) + for 
(const auto &spc_type : special_node_types) { const std::vector<NodeID> scp_nodes_ids = g.nodes(spc_type); - for(const auto &node_id : scp_nodes_ids) + for (const auto &node_id : scp_nodes_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { Tensor *tensor = node->input(2); - if(tensor != nullptr) + if (tensor != nullptr) { tensor->desc().data_type = DataType::S32; } @@ -219,10 +248,10 @@ params.name = params.name.empty() ? "" : params.name + "Bias"; TensorDescriptor b_desc = node->input(1)->desc(); - auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; - b_desc.shape = TensorShape(depth); + auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; + b_desc.shape = TensorShape(depth); - auto accessor = support::cpp14::make_unique<EmptyAccessor>(); + auto accessor = std::make_unique<EmptyAccessor>(); auto b_nid = GraphBuilder::add_const_node(g, params, b_desc, std::move(accessor)); g.add_connection(b_nid, 0, node_id, 2); } @@ -232,6 +261,10 @@ } } // namespace +SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) : _mutate_type{mutate_type} +{ +} + const char *SyntheticDataTypeMutator::name() { return "SyntheticDataTypeMutator"; } @@ -244,13 +277,13 @@ IGraphMutator::MutationType SyntheticDataTypeMutator::type() const void SyntheticDataTypeMutator::mutate(Graph &g) { - if(is_mutation_supported(g)) + if (is_mutation_supported(g)) { // Remove nodes that get optimized out (e.g. BatchNorm) remove_optimized_nodes(g); // Convert tensor - convert_tensors(g); + convert_tensors(g, _mutate_type); convert_special_tensors(g); // Handle special nodes
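The eligibility test added in InPlaceOperationMutator (try_in_place_elementwise) reduces to a simple predicate: an input operand may alias the output buffer only if its shape equals the output shape (i.e. it is not the broadcast operand), its quantization info and data type match the output's, and no accessor is bound to it, since an accessor observes the tensor's contents and aliasing would clobber them. Below is a self-contained sketch of that predicate; QuantInfo and TensorDesc are simplified stand-in types for this illustration, not the library's TensorDescriptor.

#include <cstdio>
#include <vector>

// Stand-in types mirroring only the fields the mutator consults.
struct QuantInfo
{
    float scale{0.f};
    int   offset{0};

    bool operator==(const QuantInfo &other) const
    {
        return scale == other.scale && offset == other.offset;
    }
};

enum class DataType
{
    F32,
    QASYMM8
};

struct TensorDesc
{
    std::vector<unsigned int> shape{};
    QuantInfo                 qinfo{};
    DataType                  dtype{DataType::F32};
    bool                      has_accessor{false};
};

// An input of an elementwise op may be reused as the output buffer only if it
// matches the output exactly and nothing external observes its contents.
bool can_run_in_place(const TensorDesc &input, const TensorDesc &output)
{
    return input.shape == output.shape && input.qinfo == output.qinfo && input.dtype == output.dtype &&
           !input.has_accessor;
}

int main()
{
    const TensorDesc output{{16, 16, 8}, {0.125f, -10}, DataType::QASYMM8, false};
    const TensorDesc lhs = output;                                              // identical descriptor: reusable
    const TensorDesc rhs{{16, 16, 1}, {0.125f, -10}, DataType::QASYMM8, false}; // broadcast operand: not reusable
    std::printf("lhs in-place: %d, rhs in-place: %d\n", can_run_in_place(lhs, output), can_run_in_place(rhs, output));
    return 0;
}

The depthwise path (try_in_place_depthwiseconv) layers further structural conditions on the same idea: NHWC layout, 1x1 kernel, depth multiplier 1, unit stride and no padding, so that each output element depends only on the single input element it would overwrite.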