From 88d5b22eb5574d8b564474df2c758d222b3b5547 Mon Sep 17 00:00:00 2001 From: Isabella Gottardi Date: Fri, 6 Apr 2018 12:24:55 +0100 Subject: COMPMID-1035 - Add ResneXt50 as a graph example Change-Id: I42f0e7dab38e45b5eecfe6858eaecee8939c8585 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129291 Reviewed-by: Georgios Pinitas Reviewed-by: Anthony Barbier Tested-by: Jenkins --- arm_compute/core/utils/misc/Utility.h | 10 ++ arm_compute/graph/GraphBuilder.h | 14 ++ arm_compute/graph/Types.h | 1 + arm_compute/graph/frontend/Layers.h | 27 ++++ arm_compute/graph/nodes/EltwiseLayerNode.h | 20 ++- examples/graph_alexnet.cpp | 2 +- examples/graph_googlenet.cpp | 4 +- examples/graph_inception_v3.cpp | 2 +- examples/graph_lenet.cpp | 2 +- examples/graph_mobilenet.cpp | 2 +- examples/graph_resnet50.cpp | 6 +- examples/graph_resnext50.cpp | 208 +++++++++++++++++++++++++ examples/graph_squeezenet.cpp | 4 +- examples/graph_squeezenet_v1_1.cpp | 4 +- examples/graph_vgg16.cpp | 4 +- examples/graph_vgg19.cpp | 4 +- src/graph/GraphBuilder.cpp | 29 ++++ src/graph/backends/CL/CLFunctionsFactory.cpp | 17 +- src/graph/backends/GLES/GCFunctionsFactory.cpp | 11 +- src/graph/backends/NEON/NEFunctionFactory.cpp | 15 +- src/graph/nodes/EltwiseLayerNode.cpp | 17 +- utils/GraphUtils.cpp | 39 +++++ utils/GraphUtils.h | 64 +++++++- utils/Utils.h | 37 +++++ 24 files changed, 499 insertions(+), 44 deletions(-) create mode 100644 examples/graph_resnext50.cpp diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h index 639f2e155d..f30a417a09 100644 --- a/arm_compute/core/utils/misc/Utility.h +++ b/arm_compute/core/utils/misc/Utility.h @@ -164,6 +164,16 @@ std::vector sort_indices(const std::vector &v) return idx; } + +inline bool endswith(const std::string &filename, const std::string &suffix) +{ + if(filename.size() < suffix.size()) + { + return false; + } + return std::equal(suffix.rbegin(), suffix.rend(), filename.rbegin()); +} + } // namespace utility } // 
namespace arm_compute #endif /* __ARM_COMPUTE_MISC_UTILITY_H__ */ diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h index aea28eb8d6..04edf673d1 100644 --- a/arm_compute/graph/GraphBuilder.h +++ b/arm_compute/graph/GraphBuilder.h @@ -213,6 +213,20 @@ public: * @return Node ID of the created node, EmptyNodeID in case of error */ static NodeID add_reshape_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape); + /** Adds a scale layer node to the graph + * This layer computes a product of the input with a scale (read from mul_accessor) and it applies an offset (read from add_accessor). + * output = input * mul_w + add_w + * + * @param[in] g Graph to add the layer to + * @param[in] params Common node parameters + * @param[in] input Input to the fully connected layer node as a NodeID-Index pair + * @param[in] mul_accessor (Optional) Accessor of the mul node data + * @param[in] add_accessor (Optional) Accessor of the add node data + * + * @return Node ID of the created node, EmptyNodeID in case of error + */ + static NodeID add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdxPair input, + ITensorAccessorUPtr mul_accessor = nullptr, ITensorAccessorUPtr add_accessor = nullptr); /** Adds a softmax node to the graph * * @param[in] g Graph to add the node to diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h index a910610c7a..d4e4f99377 100644 --- a/arm_compute/graph/Types.h +++ b/arm_compute/graph/Types.h @@ -137,6 +137,7 @@ enum class NodeType NormalizationLayer, PoolingLayer, ReshapeLayer, + ScaleLayer, SoftmaxLayer, SplitLayer, diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h index d122a7a967..a97684453c 100644 --- a/arm_compute/graph/frontend/Layers.h +++ b/arm_compute/graph/frontend/Layers.h @@ -380,6 +380,33 @@ private: TensorShape _shape; }; +/** Scale Layer */ +class ScaleLayer final : public ILayer +{ +public: + /** Construct a scale layer. 
+ * + * @param[in] mul_w Accessor to get mul weight from. + * @param[in] add_w Accessor to get add weight from. + */ + ScaleLayer(ITensorAccessorUPtr mul_w, + ITensorAccessorUPtr add_w) + : _mul_w(std::move(mul_w)), _add_w(std::move(add_w)) + { + } + + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_scale_layer(s.graph(), common_params, input, std::move(_mul_w), std::move(_add_w)); + } + +private: + ITensorAccessorUPtr _mul_w; + ITensorAccessorUPtr _add_w; +}; + /** Softmax Layer */ class SoftmaxLayer final : public ILayer { diff --git a/arm_compute/graph/nodes/EltwiseLayerNode.h b/arm_compute/graph/nodes/EltwiseLayerNode.h index 5b9fa84bbb..09cbc75b80 100644 --- a/arm_compute/graph/nodes/EltwiseLayerNode.h +++ b/arm_compute/graph/nodes/EltwiseLayerNode.h @@ -36,15 +36,29 @@ class EltwiseLayerNode final : public INode public: /** Constructor * - * @param[in] op Element-wise operation to perform + * @param[in] op Element-wise operation to perform + * @param[in] c_policy (Optional) Convert policy used for the operation + * @param[in] r_policy (Optional) Rounding policy used for the operation */ - EltwiseLayerNode(EltwiseOperation op); + EltwiseLayerNode(EltwiseOperation op, ConvertPolicy c_policy = ConvertPolicy::SATURATE, RoundingPolicy r_policy = RoundingPolicy::TO_ZERO); /** Eltwise operation accessor * * @return Eltwise operation that is to be performed by the node */ EltwiseOperation eltwise_operation() const; + /** Convert policy accessor + * + * @return Convert policy that is used in the node + */ + ConvertPolicy convert_policy() const; + + /** Rounding policy accessor + * + * @return Rounding policy that is used in the node + */ + RoundingPolicy rounding_policy() const; + // Inherited overridden methods: NodeType type() const override; bool forward_descriptors() override; @@ -53,6 +67,8 @@ public: private: EltwiseOperation _op; + 
ConvertPolicy _convert_policy; + RoundingPolicy _rounding_policy; }; } // namespace graph } // namespace arm_compute diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp index ffcd8b8411..9e6d91962e 100644 --- a/examples/graph_alexnet.cpp +++ b/examples/graph_alexnet.cpp @@ -195,7 +195,7 @@ private: /** Main program for AlexNet * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp index a47fc9d588..2dba67f5eb 100644 --- a/examples/graph_googlenet.cpp +++ b/examples/graph_googlenet.cpp @@ -36,7 +36,7 @@ using namespace arm_compute::graph_utils; /** Example demonstrating how to implement Googlenet's network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphGooglenetExample : public Example { @@ -215,7 +215,7 @@ private: /** Main program for Googlenet * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights 
folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp index c92e69e6a7..d1d6ab4e05 100644 --- a/examples/graph_inception_v3.cpp +++ b/examples/graph_inception_v3.cpp @@ -36,7 +36,7 @@ using namespace arm_compute::graph_utils; /** Example demonstrating how to implement InceptionV3's network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class InceptionV3Example : public Example { diff --git a/examples/graph_lenet.cpp b/examples/graph_lenet.cpp index 92be2d48c1..32c75827d3 100644 --- a/examples/graph_lenet.cpp +++ b/examples/graph_lenet.cpp @@ -136,7 +136,7 @@ private: /** Main program for LeNet * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] batches, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] batches, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { 
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp index 7bfc6808fa..50dc02482f 100644 --- a/examples/graph_mobilenet.cpp +++ b/examples/graph_mobilenet.cpp @@ -232,7 +232,7 @@ private: /** Main program for MobileNetV1 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), * [optional] Model ID (0 = MobileNetV1_1.0_224, 1 = MobileNetV1_0.75_160), * [optional] Path to the weights folder, * [optional] image, diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp index 7332bb8b15..bafa9a5852 100644 --- a/examples/graph_resnet50.cpp +++ b/examples/graph_resnet50.cpp @@ -32,10 +32,10 @@ using namespace arm_compute::utils; using namespace arm_compute::graph::frontend; using namespace arm_compute::graph_utils; -/** Example demonstrating how to implement Microsoft's ResNet50 network using the Compute Library's graph API +/** Example demonstrating how to implement ResNet50 network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphResNet50Example : public Example { @@ -252,7 +252,7 @@ private: /** Main program for ResNet50 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + 
* @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp new file mode 100644 index 0000000000..f96a02e6d6 --- /dev/null +++ b/examples/graph_resnext50.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/graph.h" +#include "support/ToolchainSupport.h" +#include "utils/GraphUtils.h" +#include "utils/Utils.h" + +#include + +using namespace arm_compute::utils; +using namespace arm_compute::graph::frontend; +using namespace arm_compute::graph_utils; + +/** Example demonstrating how to implement ResNeXt50 network using the Compute Library's graph API + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] npy_in, [optional] npy_out, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + */ +class GraphResNeXt50Example : public Example +{ +public: + void do_setup(int argc, char **argv) override + { + std::string data_path; /* Path to the trainable data */ + std::string npy_in; /* Input npy data */ + std::string npy_out; /* Output npy data */ + + // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON + const int target = argc > 1 ? 
std::strtol(argv[1], nullptr, 10) : 0; + Target target_hint = set_target_hint(target); + FastMathHint fast_math_hint = FastMathHint::DISABLED; + + // Parse arguments + if(argc < 2) + { + // Print help + std::cout << "Usage: " << argv[0] << " [target] [path_to_data] [npy_in] [npy_out] [fast_math_hint]\n\n"; + std::cout << "No data folder provided: using random values\n\n"; + } + else if(argc == 2) + { + std::cout << "Usage: " << argv[0] << " " << argv[1] << " [path_to_data] [npy_in] [npy_out] [fast_math_hint]\n\n"; + std::cout << "No data folder provided: using random values\n\n"; + } + else if(argc == 3) + { + data_path = argv[2]; + std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " [npy_in] [npy_out] [fast_math_hint]\n\n"; + std::cout << "No input npy file provided: using random values\n\n"; + } + else if(argc == 4) + { + data_path = argv[2]; + npy_in = argv[3]; + std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " [npy_out] [fast_math_hint]\n\n"; + std::cout << "No output npy file provided: skipping output accessor\n\n"; + } + else if(argc == 5) + { + data_path = argv[2]; + npy_in = argv[3]; + npy_out = argv[4]; + std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " " << argv[4] << " [fast_math_hint]\n\n"; + std::cout << "No fast math info provided: disabling fast math\n\n"; + } + else + { + data_path = argv[2]; + npy_in = argv[3]; + npy_out = argv[4]; + fast_math_hint = (std::strtol(argv[5], nullptr, 10) == 0) ? 
FastMathHint::DISABLED : FastMathHint::ENABLED; + } + + graph << target_hint + << fast_math_hint + << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), DataType::F32), + get_input_accessor(npy_in)) + << ScaleLayer(get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_mul.npy"), + get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_add.npy")) + .set_name("bn_data/Scale") + << ConvolutionLayer( + 7U, 7U, 64U, + get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy"), + get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"), + PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR)) + .set_name("conv0/Convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool0"); + + add_residual_block(data_path, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, /*stride_conv_unit1*/ 1); + add_residual_block(data_path, 512, 2, 4, 2); + add_residual_block(data_path, 1024, 3, 6, 2); + add_residual_block(data_path, 2048, 4, 3, 2); + + graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)).set_name("pool1") + << FlattenLayer().set_name("predictions/Reshape") + << OutputLayer(get_npy_output_accessor(npy_out, TensorShape(2048U), DataType::F32)); + + // Finalize graph + GraphConfig config; + config.use_tuner = (target == 2); + graph.finalize(target_hint, config); + } + + void do_run() override + { + // Run graph + graph.run(); + } + +private: + Stream graph{ 0, "ResNeXt50" }; + + void add_residual_block(const std::string &data_path, unsigned int base_depth, unsigned int stage, unsigned int num_units, unsigned int stride_conv_unit1) + { + for(unsigned int i = 0; i < num_units; ++i) + { + std::stringstream unit_path_ss; + unit_path_ss << "/cnn_data/resnext50_model/stage" << stage << "_unit" << (i + 1) << "_"; + 
std::string unit_path = unit_path_ss.str(); + + std::stringstream unit_name_ss; + unit_name_ss << "stage" << stage << "/unit" << (i + 1) << "/"; + std::string unit_name = unit_name_ss.str(); + + PadStrideInfo pad_grouped_conv(1, 1, 1, 1); + if(i == 0) + { + pad_grouped_conv = (stage == 1) ? PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 1, 1) : PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 1, 0, 1, DimensionRoundingType::FLOOR); + } + + SubStream right(graph); + right << ConvolutionLayer( + 1U, 1U, base_depth / 2, + get_weights_accessor(data_path, unit_path + "conv1_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv1_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") + + << ConvolutionLayer( + 3U, 3U, base_depth / 2, + get_weights_accessor(data_path, unit_path + "conv2_weights.npy"), + std::unique_ptr(nullptr), + pad_grouped_conv, 32) + .set_name(unit_name + "conv2/convolution") + << ScaleLayer(get_weights_accessor(data_path, unit_path + "bn2_mul.npy"), + get_weights_accessor(data_path, unit_path + "bn2_add.npy")) + .set_name(unit_name + "conv1/Scale") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv2/Relu") + + << ConvolutionLayer( + 1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "conv3_weights.npy"), + get_weights_accessor(data_path, unit_path + "conv3_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv3/convolution"); + + SubStream left(graph); + if(i == 0) + { + left << ConvolutionLayer( + 1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "sc_weights.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0)) + .set_name(unit_name + "sc/convolution") + << ScaleLayer(get_weights_accessor(data_path, unit_path + 
"sc_bn_mul.npy"), + get_weights_accessor(data_path, unit_path + "sc_bn_add.npy")) + .set_name(unit_name + "sc/scale"); + } + + graph << BranchLayer(BranchMergeMethod::ADD, std::move(left), std::move(right)).set_name(unit_name + "add"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + } + } +}; + +/** Main program for ResNeXt50 + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] npy_in, [optional] npy_out ) + */ +int main(int argc, char **argv) +{ + return arm_compute::utils::run_example(argc, argv); +} diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp index 4d7bcf1ca8..b632688839 100644 --- a/examples/graph_squeezenet.cpp +++ b/examples/graph_squeezenet.cpp @@ -37,7 +37,7 @@ using namespace arm_compute::logging; /** Example demonstrating how to implement Squeezenet's network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphSqueezenetExample : public Example { @@ -218,7 +218,7 @@ private: /** Main program for Squeezenet v1.0 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] 
Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp index f5fede2f70..9e3466b993 100644 --- a/examples/graph_squeezenet_v1_1.cpp +++ b/examples/graph_squeezenet_v1_1.cpp @@ -40,7 +40,7 @@ namespace /** Example demonstrating how to implement Squeezenet's v1.1 network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphSqueezenet_v1_1Example : public Example { @@ -223,7 +223,7 @@ private: /** Main program for Squeezenet v1.1 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp index 6db4e386de..72e724025b 100644 --- a/examples/graph_vgg16.cpp +++ b/examples/graph_vgg16.cpp @@ -35,7 +35,7 @@ using namespace arm_compute::graph_utils; /** Example 
demonstrating how to implement VGG16's network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphVGG16Example : public Example { @@ -257,7 +257,7 @@ private: /** Main program for VGG16 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp index 5a281ea86a..b15c3f2def 100644 --- a/examples/graph_vgg19.cpp +++ b/examples/graph_vgg19.cpp @@ -35,7 +35,7 @@ using namespace arm_compute::graph_utils; /** Example demonstrating how to implement VGG19's network using the Compute Library's graph API * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, 
[optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ class GraphVGG19Example : public Example { @@ -270,7 +270,7 @@ private: /** Main program for VGG19 * * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) + * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) ) */ int main(int argc, char **argv) { diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index df94d0b169..4c5d30a33f 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -399,6 +399,35 @@ NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, shape); } +NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor) +{ + CHECK_NODEIDX_PAIR(input, g); + + // Get input tensor descriptor + const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); + + // Create mul node + TensorDescriptor mul_desc = input_tensor_desc; + const size_t C = input_tensor_desc.shape[get_dimension_idx(mul_desc, DataLayoutDimension::CHANNEL)]; + mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), 1); + mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), 1); + mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL), C); + NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor)); + NodeIdxPair mul_const_nidxp = { 
mul_const_nid, 0 }; + + // Create add node + TensorDescriptor add_desc = mul_desc; + NodeID add_const_nid = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor)); + NodeIdxPair add_const_nidxp = { add_const_nid, 0 }; + + // Create node and connect + NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::MUL); + NodeIdxPair mulnode_nidxp = { mul_node, 0 }; + NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::ADD); + + return add_node; +} + NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair input, float beta) { return create_simple_single_input_output_node(g, params, input, beta); diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index 4626cb5781..ac04f1063c 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -313,10 +313,11 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); // Extract IO and info - ICLTensor *input1 = get_backing_tensor(node.input(0)); - ICLTensor *input2 = get_backing_tensor(node.input(1)); - ICLTensor *output = get_backing_tensor(node.output(0)); - const EltwiseOperation eltwise_op = node.eltwise_operation(); + ICLTensor *input1 = get_backing_tensor(node.input(0)); + ICLTensor *input2 = get_backing_tensor(node.input(1)); + ICLTensor *output = get_backing_tensor(node.output(0)); + const EltwiseOperation eltwise_op = node.eltwise_operation(); + const ConvertPolicy convert_policy = node.convert_policy(); ARM_COMPUTE_ERROR_ON(input1 == nullptr); ARM_COMPUTE_ERROR_ON(input2 == nullptr); ARM_COMPUTE_ERROR_ON(output == nullptr); @@ -327,18 +328,18 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) { std::tie(func, func_name) = create_named_function(std::string("CLArithmeticAddition"), input1, input2, 
output, - ConvertPolicy::SATURATE); + convert_policy); } else if(eltwise_op == EltwiseOperation::SUB) { std::tie(func, func_name) = create_named_function( - std::string("CLArithmeticSubtraction"), input1, input2, output, ConvertPolicy::SATURATE); + std::string("CLArithmeticSubtraction"), input1, input2, output, convert_policy); } else if(eltwise_op == EltwiseOperation::MUL) { std::tie(func, func_name) = create_named_function( - std::string("CLPixelWiseMultiplication"), input1, input2, output, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + std::string("CLPixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy, + node.rounding_policy()); } else { diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp index d3c5737e68..d53daf1109 100644 --- a/src/graph/backends/GLES/GCFunctionsFactory.cpp +++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp @@ -301,10 +301,11 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); // Extract IO and info - IGCTensor *input1 = get_backing_tensor(node.input(0)); - IGCTensor *input2 = get_backing_tensor(node.input(1)); - IGCTensor *output = get_backing_tensor(node.output(0)); - const EltwiseOperation eltwise_op = node.eltwise_operation(); + IGCTensor *input1 = get_backing_tensor(node.input(0)); + IGCTensor *input2 = get_backing_tensor(node.input(1)); + IGCTensor *output = get_backing_tensor(node.output(0)); + const EltwiseOperation eltwise_op = node.eltwise_operation(); + const ConvertPolicy convert_policy = node.convert_policy(); ARM_COMPUTE_ERROR_ON(input1 == nullptr); ARM_COMPUTE_ERROR_ON(input2 == nullptr); ARM_COMPUTE_ERROR_ON(output == nullptr); @@ -315,7 +316,7 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) { std::tie(func, func_name) = create_named_function(std::string("GCArithmeticAddition"), input1, input2, output, - ConvertPolicy::SATURATE); + convert_policy); } else 
if(eltwise_op == EltwiseOperation::SUB) { diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp index 7a37dfa39d..7f97876e57 100644 --- a/src/graph/backends/NEON/NEFunctionFactory.cpp +++ b/src/graph/backends/NEON/NEFunctionFactory.cpp @@ -294,10 +294,11 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); // Extract IO and info - ITensor *input1 = get_backing_tensor(node.input(0)); - ITensor *input2 = get_backing_tensor(node.input(1)); - ITensor *output = get_backing_tensor(node.output(0)); - const EltwiseOperation eltwise_op = node.eltwise_operation(); + ITensor *input1 = get_backing_tensor(node.input(0)); + ITensor *input2 = get_backing_tensor(node.input(1)); + ITensor *output = get_backing_tensor(node.output(0)); + const EltwiseOperation eltwise_op = node.eltwise_operation(); + const ConvertPolicy convert_policy = node.convert_policy(); ARM_COMPUTE_ERROR_ON(input1 == nullptr); ARM_COMPUTE_ERROR_ON(input2 == nullptr); ARM_COMPUTE_ERROR_ON(output == nullptr); @@ -307,18 +308,18 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) if(eltwise_op == EltwiseOperation::ADD) { std::tie(func, func_name) = create_named_function(std::string("NEArithmeticAddition"), - input1, input2, output, ConvertPolicy::SATURATE); + input1, input2, output, convert_policy); } else if(eltwise_op == EltwiseOperation::SUB) { std::tie(func, func_name) = create_named_function(std::string("NEArithmeticSubtraction"), - input1, input2, output, ConvertPolicy::SATURATE); + input1, input2, output, convert_policy); } else if(eltwise_op == EltwiseOperation::MUL) { std::tie(func, func_name) = create_named_function(std::string("NEPixelWiseMultiplication"), input1, input2, output, 1.f, - ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + convert_policy, node.rounding_policy()); } else { diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp 
index 6f1e0eecd9..568b882425 100644 --- a/src/graph/nodes/EltwiseLayerNode.cpp +++ b/src/graph/nodes/EltwiseLayerNode.cpp @@ -30,8 +30,8 @@ namespace arm_compute { namespace graph { -EltwiseLayerNode::EltwiseLayerNode(EltwiseOperation op) - : _op(op) +EltwiseLayerNode::EltwiseLayerNode(EltwiseOperation op, ConvertPolicy c_policy, RoundingPolicy r_policy) + : _op(op), _convert_policy(c_policy), _rounding_policy(r_policy) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -42,6 +42,16 @@ EltwiseOperation EltwiseLayerNode::eltwise_operation() const return _op; } +ConvertPolicy EltwiseLayerNode::convert_policy() const +{ + return _convert_policy; +} + +RoundingPolicy EltwiseLayerNode::rounding_policy() const +{ + return _rounding_policy; +} + bool EltwiseLayerNode::forward_descriptors() { if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) @@ -56,8 +66,7 @@ bool EltwiseLayerNode::forward_descriptors() TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const { - ARM_COMPUTE_UNUSED(idx); - ARM_COMPUTE_UNUSED(_op); + ARM_COMPUTE_UNUSED(idx, _op, _convert_policy, _rounding_policy); const Tensor *src = input(0); ARM_COMPUTE_ERROR_ON(src == nullptr); diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp index 145e44950b..0edb6f2a56 100644 --- a/utils/GraphUtils.cpp +++ b/utils/GraphUtils.cpp @@ -129,6 +129,45 @@ bool DummyAccessor::access_tensor(ITensor &tensor) return ret; } +NumPyAccessor::NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, std::ostream &output_stream) + : _npy_tensor(), _filename(std::move(npy_path)), _output_stream(output_stream) +{ + NumPyBinLoader loader(_filename); + + TensorInfo info(shape, 1, data_type); + _npy_tensor.allocator()->init(info); + _npy_tensor.allocator()->allocate(); + + loader.access_tensor(_npy_tensor); +} + +template +void NumPyAccessor::access_numpy_tensor(ITensor &tensor) +{ + const int num_elements = tensor.info()->total_size(); + int num_mismatches = 
utils::compare_tensor(tensor, _npy_tensor); + float percentage_mismatches = static_cast(num_mismatches) / num_elements; + + _output_stream << "Results: " << 100.f - (percentage_mismatches * 100) << " % matches with the provided output[" << _filename << "]." << std::endl; +} + +bool NumPyAccessor::access_tensor(ITensor &tensor) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON(_npy_tensor.info()->dimension(0) != tensor.info()->dimension(0)); + + switch(tensor.info()->data_type()) + { + case DataType::F32: + access_numpy_tensor(tensor); + break; + default: + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } + + return false; +} + PPMAccessor::PPMAccessor(std::string ppm_path, bool bgr, std::unique_ptr preprocessor) : _ppm_path(std::move(ppm_path)), _bgr(bgr), _preprocessor(std::move(preprocessor)) { diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h index a8507b1ac7..597708369d 100644 --- a/utils/GraphUtils.h +++ b/utils/GraphUtils.h @@ -25,9 +25,11 @@ #define __ARM_COMPUTE_GRAPH_UTILS_H__ #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/utils/misc/Utility.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/ITensorAccessor.h" #include "arm_compute/graph/Types.h" +#include "arm_compute/runtime/Tensor.h" #include #include @@ -117,6 +119,37 @@ private: unsigned int _maximum; }; +/** NumPy accessor class */ +class NumPyAccessor final : public graph::ITensorAccessor +{ +public: + /** Constructor + * + * @param[in] npy_path Path to npy file. + * @param[in] shape Shape of the numpy tensor data. + * @param[in] data_type DataType of the numpy tensor data. 
+ * @param[out] output_stream (Optional) Output stream + */ + NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, std::ostream &output_stream = std::cout); + /** Allow instances of this class to be move constructed */ + NumPyAccessor(NumPyAccessor &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NumPyAccessor(const NumPyAccessor &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NumPyAccessor &operator=(const NumPyAccessor &) = delete; + + // Inherited methods overriden: + bool access_tensor(ITensor &tensor) override; + +private: + template + void access_numpy_tensor(ITensor &tensor); + + Tensor _npy_tensor; + const std::string _filename; + std::ostream &_output_stream; +}; + /** PPM accessor class */ class PPMAccessor final : public graph::ITensorAccessor { @@ -273,7 +306,14 @@ inline std::unique_ptr get_input_accessor(const std::str } else { - return arm_compute::support::cpp14::make_unique(ppm_path, bgr, std::move(preprocessor)); + if(arm_compute::utility::endswith(ppm_path, ".npy")) + { + return arm_compute::support::cpp14::make_unique(ppm_path); + } + else + { + return arm_compute::support::cpp14::make_unique(ppm_path, bgr, std::move(preprocessor)); + } } } @@ -298,6 +338,28 @@ inline std::unique_ptr get_output_accessor(const std::st return arm_compute::support::cpp14::make_unique(labels_path, top_n, output_stream); } } +/** Generates appropriate npy output accessor according to the specified npy_path + * + * @note If npy_path is empty will generate a DummyAccessor else will generate a NpyAccessor + * + * @param[in] npy_path Path to npy file. + * @param[in] shape Shape of the numpy tensor data. + * @param[in] data_type DataType of the numpy tensor data. 
+ * @param[out] output_stream (Optional) Output stream + * + * @return An appropriate tensor accessor + */ +inline std::unique_ptr get_npy_output_accessor(const std::string &npy_path, TensorShape shape, DataType data_type, std::ostream &output_stream = std::cout) +{ + if(npy_path.empty()) + { + return arm_compute::support::cpp14::make_unique(0); + } + else + { + return arm_compute::support::cpp14::make_unique(npy_path, shape, data_type, output_stream); + } +} /** Utility function to return the TargetHint * diff --git a/utils/Utils.h b/utils/Utils.h index cadba3a088..6cb71fd3ba 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -924,6 +924,43 @@ void init_sgemm_output(T &dst, T &src0, T &src1, arm_compute::DataType dt) * @return The free memory in kB */ uint64_t get_mem_free_from_meminfo(); + +/** Compare to tensor + * + * @param[in] tensor1 First tensor to be compared. + * @param[in] tensor2 Second tensor to be compared. + * + * @return The number of mismatches + */ +template +int compare_tensor(ITensor &tensor1, ITensor &tensor2) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(&tensor1, &tensor2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(&tensor1, &tensor2); + + int num_mismatches = 0; + Window window; + window.use_tensor_dimensions(tensor1.info()->tensor_shape()); + + map(tensor1, true); + map(tensor2, true); + Iterator itensor1(&tensor1, window); + Iterator itensor2(&tensor2, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + if(std::abs(*reinterpret_cast(itensor1.ptr()) - *reinterpret_cast(itensor2.ptr())) > 0.00001) + { + ++num_mismatches; + } + }, + itensor1, itensor2); + + unmap(itensor1); + unmap(itensor2); + + return num_mismatches; +} } // namespace utils } // namespace arm_compute #endif /* __UTILS_UTILS_H__*/ -- cgit v1.2.1