author     Anthony Barbier <anthony.barbier@arm.com>  2017-09-04 18:44:23 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-09-17 13:03:09 +0100
commit     6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
tree       a7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /src
download   ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
COMPMID-344 Updated doxygen
Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
Diffstat (limited to 'src')
-rw-r--r--src/core/AccessWindowAutoPadding.cpp85
-rw-r--r--src/core/AccessWindowStatic.cpp202
-rw-r--r--src/core/AccessWindowTranspose.cpp209
-rw-r--r--src/core/CL/CLHelpers.cpp165
-rw-r--r--src/core/CL/CLKernelLibrary.cpp597
-rw-r--r--src/core/CL/ICLDistribution1D.cpp51
-rw-r--r--src/core/CL/ICLHOG.cpp47
-rw-r--r--src/core/CL/ICLKernel.cpp154
-rw-r--r--src/core/CL/ICLLut.cpp47
-rw-r--r--src/core/CL/ICLMultiHOG.cpp38
-rw-r--r--src/core/CL/ICLMultiImage.cpp39
-rw-r--r--src/core/CL/ICLSimple2DKernel.cpp48
-rw-r--r--src/core/CL/ICLSimple3DKernel.cpp47
-rw-r--r--src/core/CL/ICLSimpleKernel.cpp54
-rw-r--r--src/core/CL/ICLTensor.cpp56
-rw-r--r--src/core/CL/OpenCL.cpp586
-rw-r--r--src/core/CL/cl_kernels/absdiff.cl65
-rw-r--r--src/core/CL/cl_kernels/accumulate.cl130
-rw-r--r--src/core/CL/cl_kernels/activation_layer.cl89
-rw-r--r--src/core/CL/cl_kernels/arithmetic_op.cl122
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl99
-rw-r--r--src/core/CL/cl_kernels/bitwise_op.cl159
-rw-r--r--src/core/CL/cl_kernels/canny.cl429
-rw-r--r--src/core/CL/cl_kernels/channel_combine.cl416
-rw-r--r--src/core/CL/cl_kernels/channel_extract.cl272
-rw-r--r--src/core/CL/cl_kernels/color_convert.cl1823
-rw-r--r--src/core/CL/cl_kernels/concatenate.cl53
-rw-r--r--src/core/CL/cl_kernels/convolution3x3.cl138
-rw-r--r--src/core/CL/cl_kernels/convolution5x5.cl289
-rw-r--r--src/core/CL/cl_kernels/convolution7x7.cl340
-rw-r--r--src/core/CL/cl_kernels/convolution9x9.cl406
-rw-r--r--src/core/CL/cl_kernels/convolution_layer.cl241
-rw-r--r--src/core/CL/cl_kernels/convolution_rectangle.cl118
-rw-r--r--src/core/CL/cl_kernels/depth_convert.cl98
-rw-r--r--src/core/CL/cl_kernels/derivative.cl80
-rw-r--r--src/core/CL/cl_kernels/dilate.cl56
-rw-r--r--src/core/CL/cl_kernels/erode.cl56
-rw-r--r--src/core/CL/cl_kernels/fast_corners.cl260
-rw-r--r--src/core/CL/cl_kernels/fill_border.cl161
-rw-r--r--src/core/CL/cl_kernels/gaussian_pyramid.cl113
-rw-r--r--src/core/CL/cl_kernels/gemm.cl1099
-rw-r--r--src/core/CL/cl_kernels/harris_corners.cl376
-rw-r--r--src/core/CL/cl_kernels/helpers.h218
-rw-r--r--src/core/CL/cl_kernels/histogram.cl243
-rw-r--r--src/core/CL/cl_kernels/hog.cl455
-rw-r--r--src/core/CL/cl_kernels/integral_image.cl100
-rw-r--r--src/core/CL/cl_kernels/magnitude_phase.cl162
-rw-r--r--src/core/CL/cl_kernels/mean_stddev.cl84
-rw-r--r--src/core/CL/cl_kernels/minmaxloc.cl164
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter3x3.cl186
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter5x5.cl479
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter_helpers.h145
-rw-r--r--src/core/CL/cl_kernels/nonmax.cl70
-rw-r--r--src/core/CL/cl_kernels/normalization_layer.cl154
-rw-r--r--src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl522
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_float.cl89
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_int.cl79
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl159
-rw-r--r--src/core/CL/cl_kernels/remap.cl132
-rw-r--r--src/core/CL/cl_kernels/scale.cl123
-rw-r--r--src/core/CL/cl_kernels/scharr_filter.cl124
-rw-r--r--src/core/CL/cl_kernels/sobel_filter.cl541
-rw-r--r--src/core/CL/cl_kernels/softmax_layer.cl221
-rw-r--r--src/core/CL/cl_kernels/tablelookup.cl114
-rw-r--r--src/core/CL/cl_kernels/threshold.cl104
-rw-r--r--src/core/CL/cl_kernels/transpose.cl217
-rw-r--r--src/core/CL/cl_kernels/types.h56
-rw-r--r--src/core/CL/cl_kernels/warp_affine.cl120
-rw-r--r--src/core/CL/cl_kernels/warp_helpers.h111
-rw-r--r--src/core/CL/cl_kernels/warp_perspective.cl128
-rw-r--r--src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp102
-rw-r--r--src/core/CL/kernels/CLAccumulateKernel.cpp83
-rw-r--r--src/core/CL/kernels/CLActivationLayerKernel.cpp64
-rw-r--r--src/core/CL/kernels/CLArithmeticAdditionKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp115
-rw-r--r--src/core/CL/kernels/CLBitwiseAndKernel.cpp88
-rw-r--r--src/core/CL/kernels/CLBitwiseNotKernel.cpp48
-rw-r--r--src/core/CL/kernels/CLBitwiseOrKernel.cpp89
-rw-r--r--src/core/CL/kernels/CLBitwiseXorKernel.cpp89
-rw-r--r--src/core/CL/kernels/CLBox3x3Kernel.cpp77
-rw-r--r--src/core/CL/kernels/CLCannyEdgeKernel.cpp255
-rw-r--r--src/core/CL/kernels/CLChannelCombineKernel.cpp244
-rw-r--r--src/core/CL/kernels/CLChannelExtractKernel.cpp148
-rw-r--r--src/core/CL/kernels/CLCol2ImKernel.cpp85
-rw-r--r--src/core/CL/kernels/CLColorConvertKernel.cpp476
-rw-r--r--src/core/CL/kernels/CLConvolutionKernel.cpp330
-rw-r--r--src/core/CL/kernels/CLDepthConcatenateKernel.cpp113
-rw-r--r--src/core/CL/kernels/CLDepthConvertKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLDerivativeKernel.cpp145
-rw-r--r--src/core/CL/kernels/CLDilateKernel.cpp65
-rw-r--r--src/core/CL/kernels/CLErodeKernel.cpp65
-rw-r--r--src/core/CL/kernels/CLFastCornersKernel.cpp172
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp175
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp106
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp122
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp92
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp92
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp168
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp129
-rw-r--r--src/core/CL/kernels/CLGaussian3x3Kernel.cpp76
-rw-r--r--src/core/CL/kernels/CLGaussian5x5Kernel.cpp45
-rw-r--r--src/core/CL/kernels/CLGaussianPyramidKernel.cpp218
-rw-r--r--src/core/CL/kernels/CLHOGDescriptorKernel.cpp200
-rw-r--r--src/core/CL/kernels/CLHOGDetectorKernel.cpp130
-rw-r--r--src/core/CL/kernels/CLHarrisCornersKernel.cpp126
-rw-r--r--src/core/CL/kernels/CLHistogramKernel.cpp224
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp202
-rw-r--r--src/core/CL/kernels/CLIntegralImageKernel.cpp112
-rw-r--r--src/core/CL/kernels/CLLKTrackerKernel.cpp285
-rw-r--r--src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp116
-rw-r--r--src/core/CL/kernels/CLMagnitudePhaseKernel.cpp168
-rw-r--r--src/core/CL/kernels/CLMeanStdDevKernel.cpp134
-rw-r--r--src/core/CL/kernels/CLMedian3x3Kernel.cpp66
-rw-r--r--src/core/CL/kernels/CLMinMaxLocationKernel.cpp169
-rw-r--r--src/core/CL/kernels/CLNonLinearFilterKernel.cpp98
-rw-r--r--src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp72
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp154
-rw-r--r--src/core/CL/kernels/CLPoolingLayerKernel.cpp180
-rw-r--r--src/core/CL/kernels/CLRemapKernel.cpp108
-rw-r--r--src/core/CL/kernels/CLScaleKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLScharr3x3Kernel.cpp132
-rw-r--r--src/core/CL/kernels/CLSobel3x3Kernel.cpp133
-rw-r--r--src/core/CL/kernels/CLSobel5x5Kernel.cpp234
-rw-r--r--src/core/CL/kernels/CLSobel7x7Kernel.cpp238
-rw-r--r--src/core/CL/kernels/CLSoftmaxLayerKernel.cpp216
-rw-r--r--src/core/CL/kernels/CLTableLookupKernel.cpp63
-rw-r--r--src/core/CL/kernels/CLThresholdKernel.cpp76
-rw-r--r--src/core/CL/kernels/CLTransposeKernel.cpp82
-rw-r--r--src/core/CL/kernels/CLWarpAffineKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLWarpPerspectiveKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLWeightsReshapeKernel.cpp163
-rw-r--r--src/core/CPP/ICPPSimpleKernel.cpp53
-rw-r--r--src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp110
-rw-r--r--src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp120
-rw-r--r--src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp115
-rw-r--r--src/core/Error.cpp52
-rw-r--r--src/core/HOGInfo.cpp122
-rw-r--r--src/core/Helpers.cpp164
-rw-r--r--src/core/IAccessWindow.cpp221
-rw-r--r--src/core/IDistribution.cpp36
-rw-r--r--src/core/IDistribution1D.cpp69
-rw-r--r--src/core/IKernel.cpp54
-rw-r--r--src/core/ITensor.cpp150
-rw-r--r--src/core/MultiImageInfo.cpp53
-rw-r--r--src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp211
-rw-r--r--src/core/NEON/kernels/NEAccumulateKernel.cpp357
-rw-r--r--src/core/NEON/kernels/NEActivationLayerKernel.cpp302
-rw-r--r--src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp378
-rw-r--r--src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp371
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp187
-rw-r--r--src/core/NEON/kernels/NEBitwiseAndKernel.cpp109
-rw-r--r--src/core/NEON/kernels/NEBitwiseNotKernel.cpp96
-rw-r--r--src/core/NEON/kernels/NEBitwiseOrKernel.cpp109
-rw-r--r--src/core/NEON/kernels/NEBitwiseXorKernel.cpp105
-rw-r--r--src/core/NEON/kernels/NEBox3x3Kernel.cpp220
-rw-r--r--src/core/NEON/kernels/NECannyEdgeKernel.cpp1856
-rw-r--r--src/core/NEON/kernels/NEChannelCombineKernel.cpp467
-rw-r--r--src/core/NEON/kernels/NEChannelExtractKernel.cpp354
-rw-r--r--src/core/NEON/kernels/NECol2ImKernel.cpp124
-rw-r--r--src/core/NEON/kernels/NEColorConvertKernel.cpp582
-rw-r--r--src/core/NEON/kernels/NEConvolutionKernel.cpp1618
-rw-r--r--src/core/NEON/kernels/NECumulativeDistributionKernel.cpp110
-rw-r--r--src/core/NEON/kernels/NEDepthConcatenateKernel.cpp105
-rw-r--r--src/core/NEON/kernels/NEDepthConvertKernel.cpp384
-rw-r--r--src/core/NEON/kernels/NEDerivativeKernel.cpp224
-rw-r--r--src/core/NEON/kernels/NEDilateKernel.cpp126
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp207
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp817
-rw-r--r--src/core/NEON/kernels/NEErodeKernel.cpp126
-rw-r--r--src/core/NEON/kernels/NEFastCornersKernel.cpp474
-rw-r--r--src/core/NEON/kernels/NEFillArrayKernel.cpp91
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp259
-rw-r--r--src/core/NEON/kernels/NEFillInnerBorderKernel.cpp137
-rw-r--r--src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp191
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp423
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp128
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp202
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp1168
-rw-r--r--src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp150
-rw-r--r--src/core/NEON/kernels/NEGaussian3x3Kernel.cpp132
-rw-r--r--src/core/NEON/kernels/NEGaussian5x5Kernel.cpp203
-rw-r--r--src/core/NEON/kernels/NEGaussianPyramidKernel.cpp279
-rw-r--r--src/core/NEON/kernels/NEHOGDescriptorKernel.cpp802
-rw-r--r--src/core/NEON/kernels/NEHOGDetectorKernel.cpp186
-rw-r--r--src/core/NEON/kernels/NEHarrisCornersKernel.cpp1137
-rw-r--r--src/core/NEON/kernels/NEHistogramKernel.cpp252
-rw-r--r--src/core/NEON/kernels/NEIm2ColKernel.cpp338
-rw-r--r--src/core/NEON/kernels/NEIntegralImageKernel.cpp141
-rw-r--r--src/core/NEON/kernels/NELKTrackerKernel.cpp533
-rw-r--r--src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp226
-rw-r--r--src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp869
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevKernel.cpp152
-rw-r--r--src/core/NEON/kernels/NEMedian3x3Kernel.cpp135
-rw-r--r--src/core/NEON/kernels/NEMinMaxLocationKernel.cpp361
-rw-r--r--src/core/NEON/kernels/NENonLinearFilterKernel.cpp1009
-rw-r--r--src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp513
-rw-r--r--src/core/NEON/kernels/NENormalizationLayerKernel.cpp218
-rw-r--r--src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp524
-rw-r--r--src/core/NEON/kernels/NEPoolingLayerKernel.cpp415
-rw-r--r--src/core/NEON/kernels/NERemapKernel.cpp226
-rw-r--r--src/core/NEON/kernels/NEScaleKernel.cpp359
-rw-r--r--src/core/NEON/kernels/NEScharr3x3Kernel.cpp259
-rw-r--r--src/core/NEON/kernels/NESobel3x3Kernel.cpp269
-rw-r--r--src/core/NEON/kernels/NESobel5x5Kernel.cpp402
-rw-r--r--src/core/NEON/kernels/NESobel7x7Kernel.cpp520
-rw-r--r--src/core/NEON/kernels/NESoftmaxLayerKernel.cpp474
-rw-r--r--src/core/NEON/kernels/NETableLookupKernel.cpp142
-rw-r--r--src/core/NEON/kernels/NEThresholdKernel.cpp129
-rw-r--r--src/core/NEON/kernels/NETransposeKernel.cpp241
-rw-r--r--src/core/NEON/kernels/NEWarpKernel.cpp651
-rw-r--r--src/core/NEON/kernels/NEWeightsReshapeKernel.cpp175
-rw-r--r--src/core/PyramidInfo.cpp105
-rw-r--r--src/core/SubTensorInfo.cpp78
-rw-r--r--src/core/TensorInfo.cpp377
-rw-r--r--src/core/Utils.cpp329
-rw-r--r--src/core/Validate.cpp215
-rw-r--r--src/runtime/CL/CLDistribution1D.cpp61
-rw-r--r--src/runtime/CL/CLHOG.cpp84
-rw-r--r--src/runtime/CL/CLLut.cpp99
-rw-r--r--src/runtime/CL/CLLutAllocator.cpp77
-rw-r--r--src/runtime/CL/CLMultiHOG.cpp52
-rw-r--r--src/runtime/CL/CLMultiImage.cpp168
-rw-r--r--src/runtime/CL/CLPyramid.cpp130
-rw-r--r--src/runtime/CL/CLScheduler.cpp49
-rw-r--r--src/runtime/CL/CLSubTensor.cpp81
-rw-r--r--src/runtime/CL/CLTensor.cpp73
-rw-r--r--src/runtime/CL/CLTensorAllocator.cpp87
-rw-r--r--src/runtime/CL/ICLSimpleFunction.cpp42
-rw-r--r--src/runtime/CL/functions/CLAbsoluteDifference.cpp38
-rw-r--r--src/runtime/CL/functions/CLAccumulate.cpp52
-rw-r--r--src/runtime/CL/functions/CLActivationLayer.cpp36
-rw-r--r--src/runtime/CL/functions/CLArithmeticAddition.cpp38
-rw-r--r--src/runtime/CL/functions/CLArithmeticSubtraction.cpp38
-rw-r--r--src/runtime/CL/functions/CLBatchNormalizationLayer.cpp48
-rw-r--r--src/runtime/CL/functions/CLBitwiseAnd.cpp38
-rw-r--r--src/runtime/CL/functions/CLBitwiseNot.cpp38
-rw-r--r--src/runtime/CL/functions/CLBitwiseOr.cpp38
-rw-r--r--src/runtime/CL/functions/CLBitwiseXor.cpp38
-rw-r--r--src/runtime/CL/functions/CLBox3x3.cpp40
-rw-r--r--src/runtime/CL/functions/CLCannyEdge.cpp155
-rw-r--r--src/runtime/CL/functions/CLChannelCombine.cpp45
-rw-r--r--src/runtime/CL/functions/CLChannelExtract.cpp45
-rw-r--r--src/runtime/CL/functions/CLColorConvert.cpp59
-rw-r--r--src/runtime/CL/functions/CLConvolution.cpp114
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp247
-rw-r--r--src/runtime/CL/functions/CLDepthConcatenate.cpp71
-rw-r--r--src/runtime/CL/functions/CLDepthConvert.cpp38
-rw-r--r--src/runtime/CL/functions/CLDerivative.cpp40
-rw-r--r--src/runtime/CL/functions/CLDilate.cpp40
-rw-r--r--src/runtime/CL/functions/CLEqualizeHistogram.cpp110
-rw-r--r--src/runtime/CL/functions/CLErode.cpp40
-rw-r--r--src/runtime/CL/functions/CLFastCorners.cpp127
-rw-r--r--src/runtime/CL/functions/CLFillBorder.cpp38
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp343
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp145
-rw-r--r--src/runtime/CL/functions/CLGEMMInterleave4x4.cpp36
-rw-r--r--src/runtime/CL/functions/CLGEMMLowp.cpp85
-rw-r--r--src/runtime/CL/functions/CLGaussian3x3.cpp40
-rw-r--r--src/runtime/CL/functions/CLGaussian5x5.cpp62
-rw-r--r--src/runtime/CL/functions/CLGaussianPyramid.cpp183
-rw-r--r--src/runtime/CL/functions/CLHOGDescriptor.cpp99
-rw-r--r--src/runtime/CL/functions/CLHOGDetector.cpp69
-rw-r--r--src/runtime/CL/functions/CLHOGGradient.cpp75
-rw-r--r--src/runtime/CL/functions/CLHOGMultiDetection.cpp240
-rw-r--r--src/runtime/CL/functions/CLHarrisCorners.cpp157
-rw-r--r--src/runtime/CL/functions/CLHistogram.cpp45
-rw-r--r--src/runtime/CL/functions/CLIntegralImage.cpp46
-rw-r--r--src/runtime/CL/functions/CLLaplacianPyramid.cpp99
-rw-r--r--src/runtime/CL/functions/CLLaplacianReconstruct.cpp99
-rw-r--r--src/runtime/CL/functions/CLLocallyConnectedLayer.cpp131
-rw-r--r--src/runtime/CL/functions/CLMagnitude.cpp38
-rw-r--r--src/runtime/CL/functions/CLMeanStdDev.cpp53
-rw-r--r--src/runtime/CL/functions/CLMedian3x3.cpp40
-rw-r--r--src/runtime/CL/functions/CLMinMaxLocation.cpp98
-rw-r--r--src/runtime/CL/functions/CLNonLinearFilter.cpp40
-rw-r--r--src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp47
-rw-r--r--src/runtime/CL/functions/CLNormalizationLayer.cpp60
-rw-r--r--src/runtime/CL/functions/CLOpticalFlow.cpp150
-rw-r--r--src/runtime/CL/functions/CLPhase.cpp38
-rw-r--r--src/runtime/CL/functions/CLPixelWiseMultiplication.cpp39
-rw-r--r--src/runtime/CL/functions/CLPoolingLayer.cpp41
-rw-r--r--src/runtime/CL/functions/CLRemap.cpp50
-rw-r--r--src/runtime/CL/functions/CLScale.cpp45
-rw-r--r--src/runtime/CL/functions/CLScharr3x3.cpp40
-rw-r--r--src/runtime/CL/functions/CLSobel3x3.cpp40
-rw-r--r--src/runtime/CL/functions/CLSobel5x5.cpp81
-rw-r--r--src/runtime/CL/functions/CLSobel7x7.cpp81
-rw-r--r--src/runtime/CL/functions/CLSoftmaxLayer.cpp67
-rw-r--r--src/runtime/CL/functions/CLTableLookup.cpp38
-rw-r--r--src/runtime/CL/functions/CLThreshold.cpp38
-rw-r--r--src/runtime/CL/functions/CLTranspose.cpp38
-rw-r--r--src/runtime/CL/functions/CLWarpAffine.cpp40
-rw-r--r--src/runtime/CL/functions/CLWarpPerspective.cpp40
-rw-r--r--src/runtime/CPP/CPPScheduler.cpp225
-rw-r--r--src/runtime/CPP/SingleThreadScheduler.cpp52
-rw-r--r--src/runtime/Distribution1D.cpp42
-rw-r--r--src/runtime/HOG.cpp51
-rw-r--r--src/runtime/ILutAllocator.cpp58
-rw-r--r--src/runtime/ITensorAllocator.cpp51
-rw-r--r--src/runtime/Lut.cpp75
-rw-r--r--src/runtime/LutAllocator.cpp52
-rw-r--r--src/runtime/MultiHOG.cpp52
-rw-r--r--src/runtime/MultiImage.cpp220
-rw-r--r--src/runtime/NEON/INESimpleFunction.cpp39
-rw-r--r--src/runtime/NEON/functions/NEAbsoluteDifference.cpp38
-rw-r--r--src/runtime/NEON/functions/NEAccumulate.cpp61
-rw-r--r--src/runtime/NEON/functions/NEActivationLayer.cpp36
-rw-r--r--src/runtime/NEON/functions/NEArithmeticAddition.cpp38
-rw-r--r--src/runtime/NEON/functions/NEArithmeticSubtraction.cpp38
-rw-r--r--src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp49
-rw-r--r--src/runtime/NEON/functions/NEBitwiseAnd.cpp38
-rw-r--r--src/runtime/NEON/functions/NEBitwiseNot.cpp38
-rw-r--r--src/runtime/NEON/functions/NEBitwiseOr.cpp38
-rw-r--r--src/runtime/NEON/functions/NEBitwiseXor.cpp38
-rw-r--r--src/runtime/NEON/functions/NEBox3x3.cpp49
-rw-r--r--src/runtime/NEON/functions/NECannyEdge.cpp169
-rw-r--r--src/runtime/NEON/functions/NEChannelCombine.cpp45
-rw-r--r--src/runtime/NEON/functions/NEChannelExtract.cpp45
-rw-r--r--src/runtime/NEON/functions/NEColorConvert.cpp59
-rw-r--r--src/runtime/NEON/functions/NEConvolution.cpp120
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp246
-rw-r--r--src/runtime/NEON/functions/NEDepthConcatenate.cpp67
-rw-r--r--src/runtime/NEON/functions/NEDepthConvert.cpp44
-rw-r--r--src/runtime/NEON/functions/NEDerivative.cpp52
-rw-r--r--src/runtime/NEON/functions/NEDilate.cpp40
-rw-r--r--src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp75
-rw-r--r--src/runtime/NEON/functions/NEEqualizeHistogram.cpp62
-rw-r--r--src/runtime/NEON/functions/NEErode.cpp40
-rw-r--r--src/runtime/NEON/functions/NEFastCorners.cpp101
-rw-r--r--src/runtime/NEON/functions/NEFillBorder.cpp39
-rw-r--r--src/runtime/NEON/functions/NEFullyConnectedLayer.cpp344
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp156
-rw-r--r--src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp36
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowp.cpp84
-rw-r--r--src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp40
-rw-r--r--src/runtime/NEON/functions/NEGaussian3x3.cpp40
-rw-r--r--src/runtime/NEON/functions/NEGaussian5x5.cpp60
-rw-r--r--src/runtime/NEON/functions/NEGaussianPyramid.cpp183
-rw-r--r--src/runtime/NEON/functions/NEHOGDescriptor.cpp99
-rw-r--r--src/runtime/NEON/functions/NEHOGDetector.cpp36
-rw-r--r--src/runtime/NEON/functions/NEHOGGradient.cpp80
-rw-r--r--src/runtime/NEON/functions/NEHOGMultiDetection.cpp231
-rw-r--r--src/runtime/NEON/functions/NEHarrisCorners.cpp212
-rw-r--r--src/runtime/NEON/functions/NEHistogram.cpp58
-rw-r--r--src/runtime/NEON/functions/NEIntegralImage.cpp40
-rw-r--r--src/runtime/NEON/functions/NELaplacianPyramid.cpp102
-rw-r--r--src/runtime/NEON/functions/NELaplacianReconstruct.cpp100
-rw-r--r--src/runtime/NEON/functions/NELocallyConnectedLayer.cpp131
-rw-r--r--src/runtime/NEON/functions/NEMagnitude.cpp48
-rw-r--r--src/runtime/NEON/functions/NEMeanStdDev.cpp47
-rw-r--r--src/runtime/NEON/functions/NEMedian3x3.cpp40
-rw-r--r--src/runtime/NEON/functions/NEMinMaxLocation.cpp50
-rw-r--r--src/runtime/NEON/functions/NENonLinearFilter.cpp42
-rw-r--r--src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp47
-rw-r--r--src/runtime/NEON/functions/NENormalizationLayer.cpp61
-rw-r--r--src/runtime/NEON/functions/NEOpticalFlow.cpp119
-rw-r--r--src/runtime/NEON/functions/NEPhase.cpp38
-rw-r--r--src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp38
-rw-r--r--src/runtime/NEON/functions/NEPoolingLayer.cpp41
-rw-r--r--src/runtime/NEON/functions/NERemap.cpp53
-rw-r--r--src/runtime/NEON/functions/NEScale.cpp171
-rw-r--r--src/runtime/NEON/functions/NEScharr3x3.cpp40
-rw-r--r--src/runtime/NEON/functions/NESobel3x3.cpp40
-rw-r--r--src/runtime/NEON/functions/NESobel5x5.cpp81
-rw-r--r--src/runtime/NEON/functions/NESobel7x7.cpp81
-rw-r--r--src/runtime/NEON/functions/NESoftmaxLayer.cpp72
-rw-r--r--src/runtime/NEON/functions/NETableLookup.cpp38
-rw-r--r--src/runtime/NEON/functions/NEThreshold.cpp38
-rw-r--r--src/runtime/NEON/functions/NETranspose.cpp38
-rw-r--r--src/runtime/NEON/functions/NEWarpAffine.cpp62
-rw-r--r--src/runtime/NEON/functions/NEWarpPerspective.cpp62
-rw-r--r--src/runtime/OMP/OMPScheduler.cpp83
-rw-r--r--src/runtime/Pyramid.cpp120
-rw-r--r--src/runtime/Scheduler.cpp149
-rw-r--r--src/runtime/SubTensor.cpp57
-rw-r--r--src/runtime/Tensor.cpp51
-rw-r--r--src/runtime/TensorAllocator.cpp119
-rw-r--r--src/runtime/Utils.cpp42
380 files changed, 63754 insertions, 0 deletions
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
new file mode 100644
index 0000000000..b75ebcfeb8
--- /dev/null
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowAutoPadding.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
+ : _info(info)
+{
+}
+
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(input_valid_region);
+ ARM_COMPUTE_UNUSED(border_undefined);
+ ARM_COMPUTE_UNUSED(border_size);
+
+ return compute_valid_region();
+}
+
+ValidRegion AccessWindowAutoPadding::compute_valid_region() const
+{
+ if(_info == nullptr)
+ {
+ return ValidRegion();
+ }
+
+ return ValidRegion(Coordinates(), _info->tensor_shape());
+}
+
+void AccessWindowAutoPadding::set_valid_region()
+{
+ if(_info == nullptr)
+ {
+ return;
+ }
+
+ _info->set_valid_region(compute_valid_region());
+}
+
+bool AccessWindowAutoPadding::update_window_if_needed(Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ return false;
+}
+
+bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ // Update strides in tensor info
+ return _info->auto_padding();
+}
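
AccessWindowAutoPadding simply requests the maximum padding the tensor allows: compute_valid_region() reports the whole tensor shape as valid, and update_padding_if_needed() defers to ITensorInfo::auto_padding(). A minimal configure-time sketch of how such an access window could be driven (the surrounding tensor and window setup is assumed for illustration and is not part of this patch):

    // Hypothetical kernel configure() snippet; "input" is assumed to be an
    // ITensor* whose ITensorInfo is still resizable at this point.
    arm_compute::AccessWindowAutoPadding access(input->info());

    // Grow the tensor's padding so the kernel can access elements without
    // per-pixel bounds checks (no-op if the tensor is no longer resizable).
    access.update_padding_if_needed(window);

    // Mark the whole tensor shape as the valid region.
    access.set_valid_region();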
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
new file mode 100644
index 0000000000..8b6419c485
--- /dev/null
+++ b/src/core/AccessWindowStatic.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowStatic.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y)
+ : _info(info), _start_x(start_x), _start_y(start_y), _end_x(end_x), _end_y(end_y)
+{
+}
+
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ ARM_COMPUTE_UNUSED(border_undefined);
+ ARM_COMPUTE_UNUSED(border_size);
+
+ return compute_valid_region(window, input_valid_region);
+}
+
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ TensorShape &shape = input_valid_region.shape;
+
+ // Start of the valid region is equal to the start of the static access but
+ // never outside of the tensor.
+ anchor.set(0, std::max<int>(0, _start_x));
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(0, _start_y));
+ }
+
+ // End of the valid region is equal to the end of the static access but
+ // never outside of the tensor.
+ shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+ }
+
+ // For higher dimensions use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
+
+void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
+{
+ if(_info != nullptr)
+ {
+ _info->set_valid_region(compute_valid_region(window, input_valid_region));
+ }
+}
+
+bool AccessWindowStatic::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ // Adjust window start for Y dimension
+ if(_start_y < 0)
+ {
+ // Calculate rows available above the tensor
+ const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
+
+ if(_start_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(_start_y, front_pad_y_available, window.y().step());
+
+ window.set(1, Window::Dimension(start, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, -window.y().start());
+ }
+
+ // Adjust window end for Y dimension
+ if(_end_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(_end_y, shape[1] + tail_pad_y_available, window.y().step()) + window.y().step();
+ window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(_start_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(_start_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(_start_x, front_pad_x_available, window.x().step());
+ window.set(0, Window::Dimension(start, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, -window.x().start());
+ }
+
+ // Adjust window end for X dimension
+ if(_end_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(_end_x, shape[0] + tail_pad_x_available, window.x().step()) + window.x().step();
+ window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowStatic::update_padding_if_needed(const Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -_start_x);
+ padding.right = std::max<int>(0, _end_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -_start_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, _end_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
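
The padding requested by AccessWindowStatic::update_padding_if_needed() follows directly from how far the static access bounds reach outside the tensor shape. A small worked example (the numbers are chosen purely for illustration):

    // Tensor of shape[0] = 8 columns and shape[1] = 5 rows, static access
    // spanning x in [-2, 10) and y in [-1, 6):
    arm_compute::AccessWindowStatic access(info, /*start_x=*/-2, /*start_y=*/-1, /*end_x=*/10, /*end_y=*/6);

    // update_padding_if_needed() then requests, provided info->is_resizable():
    //   left   = max(0, -(-2))  = 2
    //   right  = max(0, 10 - 8) = 2
    //   top    = max(0, -(-1))  = 1
    //   bottom = max(0, 6 - 5)  = 1
    // i.e. the tensor is extended by exactly the amount the access window
    // reaches outside its shape.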
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
new file mode 100644
index 0000000000..b3605c43f7
--- /dev/null
+++ b/src/core/AccessWindowTranspose.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ TensorShape &shape = input_valid_region.shape;
+ Coordinates old_anchor(anchor);
+ TensorShape old_shape(shape);
+
+ if(!border_undefined)
+ {
+ border_size = BorderSize(0);
+ }
+
+ // Start of the valid region is equal to the start of the window. But it
+ // cannot be less than the start of the input's valid region plus the border
+ // size required by this kernel (if undefined).
+ // Additionally the valid region is shifted by the offset that is used by
+ // the kernel to write back output values.
+ // As the relation between input and output is transposed window.y() is
+ // used for x anchor and window.x() for y anchor.
+ anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+ anchor.set(1, std::max<int>(window.x().start() * _scale_y, anchor[0] + border_size.left) + _y);
+
+ // End of the valid region is equal to the start of the last write of the
+ // kernel plus the number of written elements. (This assumes that all
+ // written elements are valid). Nevertheless the end cannot be larger than
+ // the end of the input's valid region minus the border size.
+ // Note: the region stores its size rather than its end points. Thus the
+ // old size is first converted into end points to be compared against the
+ // execution window. Afterwards the new end points are converted back into
+ // a size of the region.
+ // As the relation between input and output is transposed window.y() is
+ // used for x shape and window.x() for y shape.
+ shape.set(0, std::min<int>(old_anchor[1] + old_shape[1] - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(1, std::min<int>(old_anchor[0] + old_shape[0] - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+
+ // For higher dimensions use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
+
+bool AccessWindowTranspose::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ // Transpose and scale
+ const int min_y = window.x().start() * _scale_y + _y;
+ const int max_y = window.x().end() * _scale_y + _y;
+
+ // Adjust window start for the output's Y dimension (which corresponds to the X dimension of the input window)
+ if(min_y < 0)
+ {
+ // Calculate rows available above the tensor
+ const int front_pad_y_available = -offset_first_element / strides[1];
+
+ if(min_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
+
+ window.set(0, Window::Dimension(start / _scale_y, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, static_cast<int>(std::floor(-window.x().start() * _scale_y)) - _y);
+ }
+
+ // Adjust window end for Y dimension
+ if(max_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+ window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ // Transpose and scale
+ const int min_x = window.y().start() * _scale_x + _x;
+ const int max_x = window.y().end() * _scale_x + _x;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(min_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(min_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
+ window.set(1, Window::Dimension(start / _scale_x, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, static_cast<int>(std::floor(-window.y().start() * _scale_x)) - _x);
+ }
+
+ // Adjust window end for X dimension
+ if(max_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+ window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowTranspose::update_padding_if_needed(const Window &window) const
+{
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ ARM_COMPUTE_ERROR_ON(window.y().step() == 0);
+ ARM_COMPUTE_ERROR_ON(window.x().step() == 0);
+
+ const int min_x = window.y().start() * _scale_x + _x;
+ const int max_x = window.y().end() * _scale_x + _x;
+ const int min_y = window.x().start() * _scale_y + _y;
+ const int max_y = window.x().end() * _scale_y + _y;
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -min_x);
+ padding.right = std::max<int>(0, max_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
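
Because AccessWindowTranspose describes an output whose axes are swapped with respect to the execution window, the window's Y range drives the X extent of the access and the window's X range drives its Y extent (see the min_x/max_x/min_y/max_y computations above). A short illustration of the mapping, with values chosen purely for illustration:

    // With _scale_x = _scale_y = 1 and _x = _y = 0:
    //   min_x = window.y().start()   max_x = window.y().end()
    //   min_y = window.x().start()   max_y = window.x().end()
    //
    // So an execution window covering x in [0, 16) and y in [0, 24) yields a
    // transposed access spanning x in [0, 24) and y in [0, 16), and the
    // requested padding is computed against those swapped extents.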
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
new file mode 100644
index 0000000000..21b72ddd3b
--- /dev/null
+++ b/src/core/CL/CLHelpers.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <map>
+#include <vector>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "G7")
+ {
+ target = arm_compute::GPUTarget::G70;
+ }
+
+ return target;
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "T6")
+ {
+ target = arm_compute::GPUTarget::T600;
+ }
+ else if(name == "T7")
+ {
+ target = arm_compute::GPUTarget::T700;
+ }
+ else if(name == "T8")
+ {
+ target = arm_compute::GPUTarget::T800;
+ }
+
+ return target;
+}
+} // namespace
+
+namespace arm_compute
+{
+std::string get_cl_type_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return "uchar";
+ case DataType::S8:
+ return "char";
+ case DataType::U16:
+ return "ushort";
+ case DataType::S16:
+ return "short";
+ case DataType::U32:
+ return "uint";
+ case DataType::S32:
+ return "int";
+ case DataType::U64:
+ return "ulong";
+ case DataType::S64:
+ return "long";
+ case DataType::F16:
+ return "half";
+ case DataType::F32:
+ return "float";
+ default:
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ return "";
+ }
+}
+
+const std::string &string_from_target(GPUTarget target)
+{
+ static std::map<GPUTarget, const std::string> gpu_target_map =
+ {
+ { GPUTarget::MIDGARD, "midgard" },
+ { GPUTarget::BIFROST, "bifrost" },
+ { GPUTarget::T600, "t600" },
+ { GPUTarget::T700, "t700" },
+ { GPUTarget::T800, "t800" },
+ { GPUTarget::G70, "g70" }
+ };
+
+ return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_device(cl::Device &device)
+{
+ const std::string name_mali("Mali-");
+ GPUTarget target{ GPUTarget::MIDGARD };
+
+ size_t name_size = 0;
+ std::vector<char> name;
+
+ // Query device name size
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
+ // Resize vector
+ name.resize(name_size);
+ // Query device name
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string name_str(name.begin(), name.end());
+ auto pos = name_str.find(name_mali);
+
+ if(pos != std::string::npos)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
+ std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+
+ if(sub_name[0] == 'G')
+ {
+ target = get_bifrost_target(sub_name);
+ }
+ else if(sub_name[0] == 'T')
+ {
+ target = get_midgard_target(sub_name);
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
+ }
+
+ return target;
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+ return (target & GPUTarget::GPU_ARCH_MASK);
+}
+} // namespace arm_compute
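
The helpers above map an OpenCL device name onto a GPUTarget value and back. A minimal sketch of how they can be combined (the device-discovery call and the reporting function are assumptions for illustration, not part of this patch):

    #include "arm_compute/core/CL/CLHelpers.h"

    #include <iostream>

    // Hypothetical helper: print the detected target and branch on its architecture.
    void report_gpu_target(cl::Device &device)
    {
        arm_compute::GPUTarget target = arm_compute::get_target_from_device(device);

        // string_from_target() returns a printable name, e.g. "t700" or "g70".
        std::cout << "Detected GPU target: " << arm_compute::string_from_target(target) << std::endl;

        // get_arch_from_target() masks away the specific core so architecture-level
        // decisions (Midgard vs Bifrost) can be made with a single comparison.
        if(arm_compute::get_arch_from_target(target) == arm_compute::GPUTarget::BIFROST)
        {
            // e.g. select the Bifrost-tuned "gemm_mm_f32_bifrost" kernel variant.
        }
    }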
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 0000000000..15a5d90835
--- /dev/null
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+Program::Program()
+ : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+{
+}
+
+Program::Program(cl::Context context, std::string name, std::string source)
+ : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+{
+}
+
+Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
+ : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+{
+}
+
+Program::operator cl::Program() const
+{
+ if(_is_binary)
+ {
+ return cl::Program(_context, { _device }, { _binary });
+ }
+ else
+ {
+ return cl::Program(_context, _source, false);
+ }
+}
+
+bool Program::build(const cl::Program &program, const std::string &build_options)
+{
+ try
+ {
+ return program.build(build_options.c_str()) == CL_SUCCESS;
+ }
+ catch(const cl::Error &e)
+ {
+ cl_int err = CL_SUCCESS;
+ const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
+
+ for(auto &pair : build_info)
+ {
+ std::cerr << pair.second << std::endl;
+ }
+
+ return false;
+ }
+}
+
+cl::Program Program::build(const std::string &build_options) const
+{
+ cl::Program cl_program = static_cast<cl::Program>(*this);
+ build(cl_program, build_options);
+ return cl_program;
+}
+
+Kernel::Kernel()
+ : _name(), _kernel()
+{
+}
+
+Kernel::Kernel(std::string name, const cl::Program &program)
+ : _name(std::move(name)),
+ _kernel(cl::Kernel(program, _name.c_str()))
+{
+}
+
+const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+{
+ { "absdiff", "absdiff.cl" },
+ { "accumulate", "accumulate.cl" },
+ { "accumulate_squared", "accumulate.cl" },
+ { "accumulate_weighted", "accumulate.cl" },
+ { "activation_layer", "activation_layer.cl" },
+ { "arithmetic_add", "arithmetic_op.cl" },
+ { "arithmetic_sub", "arithmetic_op.cl" },
+ { "bitwise_or", "bitwise_op.cl" },
+ { "bitwise_and", "bitwise_op.cl" },
+ { "bitwise_xor", "bitwise_op.cl" },
+ { "bitwise_not", "bitwise_op.cl" },
+ { "channel_combine_NV", "channel_combine.cl" },
+ { "channel_combine_RGB888", "channel_combine.cl" },
+ { "channel_combine_RGBA8888", "channel_combine.cl" },
+ { "channel_combine_UYVY422", "channel_combine.cl" },
+ { "channel_combine_YUYV422", "channel_combine.cl" },
+ { "channel_extract_NV12", "channel_extract.cl" },
+ { "channel_extract_NV21", "channel_extract.cl" },
+ { "channel_extract_RGB888", "channel_extract.cl" },
+ { "channel_extract_RGBA8888", "channel_extract.cl" },
+ { "channel_extract_UYVY422", "channel_extract.cl" },
+ { "channel_extract_YUYV422", "channel_extract.cl" },
+ { "combine_gradients_L1", "canny.cl" },
+ { "combine_gradients_L2", "canny.cl" },
+ { "concatenate_depth", "concatenate.cl" },
+ { "convolution_rectangle", "convolution_rectangle.cl" },
+ { "col2im", "convolution_layer.cl" },
+ { "convolution3x3_static", "convolution3x3.cl" },
+ { "convolution5x5_static", "convolution5x5.cl" },
+ { "convolution7x7_static", "convolution7x7.cl" },
+ { "convolution9x9_static", "convolution9x9.cl" },
+ { "convolution_separable1x5_static", "convolution5x5.cl" },
+ { "convolution_separable5x1_static", "convolution5x5.cl" },
+ { "convolution_separable1x7_static", "convolution7x7.cl" },
+ { "convolution_separable7x1_static", "convolution7x7.cl" },
+ { "convolution_separable1x9_static", "convolution9x9.cl" },
+ { "convolution_separable9x1_static", "convolution9x9.cl" },
+ { "convert_depth_down", "depth_convert.cl" },
+ { "convert_depth_up", "depth_convert.cl" },
+ { "copy_plane", "channel_extract.cl" },
+ { "copy_planes_3p", "channel_combine.cl" },
+ { "copy_to_keypoint", "fast_corners.cl" },
+ { "derivative", "derivative.cl" },
+ { "dilate", "dilate.cl" },
+ { "erode", "erode.cl" },
+ { "fast_corners", "fast_corners.cl" },
+ { "fill_image_borders_constant", "fill_border.cl" },
+ { "fill_image_borders_replicate", "fill_border.cl" },
+ { "finalize", "optical_flow_pyramid_lk.cl" },
+ { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
+ { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
+ { "gemm_accumulate_biases_f16", "gemm.cl" },
+ { "gemm_accumulate_biases_f32", "gemm.cl" },
+ { "gemm_interleave4x4_8bit", "gemm.cl" },
+ { "gemm_interleave4x4_16bit", "gemm.cl" },
+ { "gemm_interleave4x4_32bit", "gemm.cl" },
+ { "gemm_ma_f16", "gemm.cl" },
+ { "gemm_ma_f32", "gemm.cl" },
+ { "gemm_mm_u8", "gemm.cl" },
+ { "gemm_mm_f16", "gemm.cl" },
+ { "gemm_mm_f32_midgard", "gemm.cl" },
+ { "gemm_mm_f32_bifrost", "gemm.cl" },
+ { "gemm_vm_f16", "gemm.cl" },
+ { "gemm_vm_f32", "gemm.cl" },
+ { "gemm_lc_vm_f32", "gemm.cl" },
+ { "gemm_transpose1x16_u8", "gemm.cl" },
+ { "gemm_transpose1x8_f16", "gemm.cl" },
+ { "gemm_transpose1x4_f32", "gemm.cl" },
+ { "harris_score_3x3", "harris_corners.cl" },
+ { "harris_score_5x5", "harris_corners.cl" },
+ { "harris_score_7x7", "harris_corners.cl" },
+ { "hist_border_kernel", "histogram.cl" },
+ { "hist_border_kernel_fixed", "histogram.cl" },
+ { "hist_local_kernel", "histogram.cl" },
+ { "hist_local_kernel_fixed", "histogram.cl" },
+ { "hog_block_normalization", "hog.cl" },
+ { "hog_detector", "hog.cl" },
+ { "hog_orientation_binning", "hog.cl" },
+ { "hysteresis", "canny.cl" },
+ { "im2col_generic", "convolution_layer.cl" },
+ { "im2col_reduced", "convolution_layer.cl" },
+ { "init_level", "optical_flow_pyramid_lk.cl" },
+ { "init_level_max", "optical_flow_pyramid_lk.cl" },
+ { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" },
+ { "integral_horizontal", "integral_image.cl" },
+ { "integral_vertical", "integral_image.cl" },
+ { "IYUV_to_NV12_bt709", "color_convert.cl" },
+ { "IYUV_to_RGB888_bt709", "color_convert.cl" },
+ { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
+ { "IYUV_to_YUV444_bt709", "color_convert.cl" },
+ { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
+ { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
+ { "magnitude_phase", "magnitude_phase.cl" },
+ { "mean_stddev_accumulate", "mean_stddev.cl" },
+ { "minmax", "minmaxloc.cl" },
+ { "minmax_border", "minmaxloc.cl" },
+ { "minmaxloc", "minmaxloc.cl" },
+ { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" },
+ { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" },
+ { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
+ { "non_max_suppression", "nonmax.cl" },
+ { "normalization_layer_cross_map", "normalization_layer.cl" },
+ { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+ { "batchnormalization_layer", "batchnormalization_layer.cl" },
+ { "NV12_to_IYUV_bt709", "color_convert.cl" },
+ { "NV12_to_RGB888_bt709", "color_convert.cl" },
+ { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
+ { "NV12_to_YUV444_bt709", "color_convert.cl" },
+ { "NV21_to_IYUV_bt709", "color_convert.cl" },
+ { "NV21_to_RGB888_bt709", "color_convert.cl" },
+ { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
+ { "NV21_to_YUV444_bt709", "color_convert.cl" },
+ { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
+ { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
+ { "pooling_layer_2", "pooling_layer.cl" },
+ { "pooling_layer_3", "pooling_layer.cl" },
+ { "remap_nearest_neighbour", "remap.cl" },
+ { "remap_bilinear", "remap.cl" },
+ { "reshape_to_columns", "convolution_layer.cl" },
+ { "RGB888_to_IYUV_bt709", "color_convert.cl" },
+ { "RGB888_to_NV12_bt709", "color_convert.cl" },
+ { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
+ { "RGB888_to_YUV444_bt709", "color_convert.cl" },
+ { "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
+ { "RGBA8888_to_NV12_bt709", "color_convert.cl" },
+ { "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
+ { "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+ { "scale_nearest_neighbour", "scale.cl" },
+ { "scale_bilinear", "scale.cl" },
+ { "scharr3x3", "scharr_filter.cl" },
+ { "sobel3x3", "sobel_filter.cl" },
+ { "sobel_separable5x1", "sobel_filter.cl" },
+ { "sobel_separable1x5", "sobel_filter.cl" },
+ { "sobel_separable7x1", "sobel_filter.cl" },
+ { "sobel_separable1x7", "sobel_filter.cl" },
+ { "softmax_layer_max", "softmax_layer.cl" },
+ { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
+ { "softmax_layer_norm", "softmax_layer.cl" },
+ { "suppress_non_maximum", "canny.cl" },
+ { "tablelookup_U8", "tablelookup.cl" },
+ { "tablelookup_S16", "tablelookup.cl" },
+ { "threshold_binary", "threshold.cl" },
+ { "threshold_range", "threshold.cl" },
+ { "transpose", "transpose.cl" },
+ { "UYVY422_to_IYUV_bt709", "color_convert.cl" },
+ { "UYVY422_to_NV12_bt709", "color_convert.cl" },
+ { "UYVY422_to_RGB888_bt709", "color_convert.cl" },
+ { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
+ { "warp_affine_nearest_neighbour", "warp_affine.cl" },
+ { "warp_affine_bilinear", "warp_affine.cl" },
+ { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
+ { "warp_perspective_bilinear", "warp_perspective.cl" },
+ { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
+ { "YUYV422_to_NV12_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" },
+};
+
+const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+{
+#ifdef EMBEDDED_KERNELS
+ {
+ "absdiff.cl",
+#include "./cl_kernels/absdiff.clembed"
+ },
+ {
+ "accumulate.cl",
+#include "./cl_kernels/accumulate.clembed"
+ },
+ {
+ "activation_layer.cl",
+#include "./cl_kernels/activation_layer.clembed"
+ },
+ {
+ "arithmetic_op.cl",
+#include "./cl_kernels/arithmetic_op.clembed"
+ },
+ {
+ "bitwise_op.cl",
+#include "./cl_kernels/bitwise_op.clembed"
+ },
+ {
+ "canny.cl",
+#include "./cl_kernels/canny.clembed"
+ },
+ {
+ "channel_combine.cl",
+#include "./cl_kernels/channel_combine.clembed"
+ },
+ {
+ "channel_extract.cl",
+#include "./cl_kernels/channel_extract.clembed"
+ },
+ {
+ "concatenate.cl",
+#include "./cl_kernels/concatenate.clembed"
+ },
+ {
+ "color_convert.cl",
+#include "./cl_kernels/color_convert.clembed"
+ },
+ {
+ "convolution3x3.cl",
+#include "./cl_kernels/convolution3x3.clembed"
+ },
+ {
+ "convolution5x5.cl",
+#include "./cl_kernels/convolution5x5.clembed"
+ },
+ {
+ "convolution7x7.cl",
+#include "./cl_kernels/convolution7x7.clembed"
+ },
+ {
+ "convolution9x9.cl",
+#include "./cl_kernels/convolution9x9.clembed"
+ },
+ {
+ "convolution_layer.cl",
+#include "./cl_kernels/convolution_layer.clembed"
+ },
+ {
+ "convolution_rectangle.cl",
+#include "./cl_kernels/convolution_rectangle.clembed"
+ },
+ {
+ "depth_convert.cl",
+#include "./cl_kernels/depth_convert.clembed"
+ },
+ {
+ "derivative.cl",
+#include "./cl_kernels/derivative.clembed"
+ },
+ {
+ "dilate.cl",
+#include "./cl_kernels/dilate.clembed"
+ },
+ {
+ "erode.cl",
+#include "./cl_kernels/erode.clembed"
+ },
+ {
+ "fast_corners.cl",
+#include "./cl_kernels/fast_corners.clembed"
+ },
+ {
+ "fill_border.cl",
+#include "./cl_kernels/fill_border.clembed"
+ },
+ {
+ "gaussian_pyramid.cl",
+#include "./cl_kernels/gaussian_pyramid.clembed"
+ },
+ {
+ "gemm.cl",
+#include "./cl_kernels/gemm.clembed"
+ },
+ {
+ "harris_corners.cl",
+#include "./cl_kernels/harris_corners.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "histogram.cl",
+#include "./cl_kernels/histogram.clembed"
+ },
+ {
+ "hog.cl",
+#include "./cl_kernels/hog.clembed"
+ },
+ {
+ "integral_image.cl",
+#include "./cl_kernels/integral_image.clembed"
+ },
+ {
+ "magnitude_phase.cl",
+#include "./cl_kernels/magnitude_phase.clembed"
+ },
+ {
+ "mean_stddev.cl",
+#include "./cl_kernels/mean_stddev.clembed"
+ },
+ {
+ "minmaxloc.cl",
+#include "./cl_kernels/minmaxloc.clembed"
+ },
+ {
+ "non_linear_filter3x3.cl",
+#include "./cl_kernels/non_linear_filter3x3.clembed"
+ },
+ {
+ "non_linear_filter5x5.cl",
+#include "./cl_kernels/non_linear_filter5x5.clembed"
+ },
+ {
+ "non_linear_filter_helpers.h",
+#include "./cl_kernels/non_linear_filter_helpers.hembed"
+ },
+ {
+ "nonmax.cl",
+#include "./cl_kernels/nonmax.clembed"
+ },
+ {
+ "normalization_layer.cl",
+#include "./cl_kernels/normalization_layer.clembed"
+ },
+ {
+ "batchnormalization_layer.cl",
+#include "./cl_kernels/batchnormalization_layer.clembed"
+ },
+ {
+ "optical_flow_pyramid_lk.cl",
+#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
+ },
+ {
+ "pixelwise_mul_float.cl",
+#include "./cl_kernels/pixelwise_mul_float.clembed"
+ },
+ {
+ "pixelwise_mul_int.cl",
+#include "./cl_kernels/pixelwise_mul_int.clembed"
+ },
+ {
+ "pooling_layer.cl",
+#include "./cl_kernels/pooling_layer.clembed"
+ },
+ {
+ "remap.cl",
+#include "./cl_kernels/remap.clembed"
+ },
+ {
+ "scale.cl",
+#include "./cl_kernels/scale.clembed"
+ },
+ {
+ "scharr_filter.cl",
+#include "./cl_kernels/scharr_filter.clembed"
+ },
+ {
+ "sobel_filter.cl",
+#include "./cl_kernels/sobel_filter.clembed"
+ },
+ {
+ "softmax_layer.cl",
+#include "./cl_kernels/softmax_layer.clembed"
+ },
+ {
+ "tablelookup.cl",
+#include "./cl_kernels/tablelookup.clembed"
+ },
+ {
+ "threshold.cl",
+#include "./cl_kernels/threshold.clembed"
+ },
+ {
+ "transpose.cl",
+#include "./cl_kernels/transpose.clembed"
+ },
+ {
+ "types.h",
+#include "./cl_kernels/types.hembed"
+ },
+ {
+ "warp_affine.cl",
+#include "./cl_kernels/warp_affine.clembed"
+ },
+ {
+ "warp_helpers.h",
+#include "./cl_kernels/warp_helpers.hembed"
+ },
+ {
+ "warp_perspective.cl",
+#include "./cl_kernels/warp_perspective.clembed"
+ }
+#endif
+};
+
+CLKernelLibrary::CLKernelLibrary()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+}
+
+CLKernelLibrary &CLKernelLibrary::get()
+{
+ static CLKernelLibrary _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if(_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+
+    // Check if the program has been built before with the same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set);
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if(_built_programs_map.end() != built_program_it)
+ {
+        // If the program has already been built, reuse it to create the kernel
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+const Program &CLKernelLibrary::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if(program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if(_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if(std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name, std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if(std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif
+
+    // Insert the program into the programs map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibrary::stringify_set(const StringSet &s) const
+{
+ std::string concat_set = "-cl-arm-non-uniform-work-group-size ";
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for(const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
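
For reference, a minimal host-side sketch of how a caller drives create_kernel(): the built program is cached under the program name plus the stringified options, so repeated requests compile only once. This sketch assumes the library has already been handed a valid CL context and device, that StringSet is an alias for std::set<std::string>, and that "accumulate" is registered in the kernel/program map.

    #include <set>
    #include <string>

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    // Sketch only: the "accumulate" kernel takes no extra -D defines, so an
    // empty option set is enough here.
    arm_compute::Kernel build_accumulate_kernel()
    {
        const std::set<std::string> no_opts;

        // First call loads and builds accumulate.cl; later calls with the same
        // options reuse the cached cl::Program.
        return arm_compute::CLKernelLibrary::get().create_kernel("accumulate", no_opts);
    }
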
diff --git a/src/core/CL/ICLDistribution1D.cpp b/src/core/CL/ICLDistribution1D.cpp
new file mode 100644
index 0000000000..a645d0ed71
--- /dev/null
+++ b/src/core/CL/ICLDistribution1D.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+ICLDistribution1D::ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : IDistribution1D(num_bins, offset, range), _mapping(nullptr)
+{
+}
+
+void ICLDistribution1D::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ _mapping = do_map(q, blocking);
+}
+
+void ICLDistribution1D::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+uint32_t *ICLDistribution1D::buffer() const
+{
+ return _mapping;
+}
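
map() and unmap() bracket any host-side access to the distribution: buffer() is only meaningful in between, and the assertions above enforce that. A minimal sketch of the intended pattern, assuming 'dist' and 'queue' are valid, fully constructed objects:

    #include <cstdint>

    #include "arm_compute/core/CL/ICLDistribution1D.h"

    // Sketch: blocking map, read one bin on the host, unmap again.
    uint32_t read_first_bin(arm_compute::ICLDistribution1D &dist, cl::CommandQueue &queue)
    {
        dist.map(queue, true);                  // buffer() becomes valid here
        const uint32_t bin0 = dist.buffer()[0];
        dist.unmap(queue);                      // buffer() is invalid again
        return bin0;
    }
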
diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp
new file mode 100644
index 0000000000..e1829971cf
--- /dev/null
+++ b/src/core/CL/ICLHOG.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLHOG.h"
+
+using namespace arm_compute;
+
+ICLHOG::ICLHOG()
+ : _mapping(nullptr)
+{
+}
+
+void ICLHOG::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLHOG::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+float *ICLHOG::descriptor() const
+{
+ return reinterpret_cast<float *>(_mapping);
+}
\ No newline at end of file
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
new file mode 100644
index 0000000000..7ac0fe3bbb
--- /dev/null
+++ b/src/core/CL/ICLKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint)
+{
+ if(kernel.kernel()() == nullptr)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+
+ cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
+ (window.y().end() - window.y().start()) / window.y().step(),
+ (window.z().end() - window.z().start()) / window.z().step());
+
+ cl::NDRange lws = cl::NullRange;
+
+ if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2]))
+ {
+ lws = lws_hint;
+ }
+
+ queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
+}
+
+ICLKernel::ICLKernel()
+ : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
+{
+}
+
+cl::Kernel &ICLKernel::kernel()
+{
+ return _kernel;
+}
+
+template <unsigned int dimension_size>
+unsigned int ICLKernel::num_arguments_per_tensor() const
+{
+ return 2 + 2 * dimension_size;
+}
+
+template <unsigned int dimension_size>
+void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ const Strides &strides = info->strides_in_bytes();
+
+ // Calculate offset to the start of the window
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+
+ for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ {
+ offset_first_element += window[n].start() * strides[n];
+ }
+
+ unsigned int idx_start = idx;
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+ {
+ _kernel.setArg<cl_uint>(idx++, strides[dimension]);
+ _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
+ }
+
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+
+ ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
+ "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+ ARM_COMPUTE_UNUSED(idx_start);
+}
+
+void ICLKernel::add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<1>(idx, tensor, window);
+}
+
+void ICLKernel::add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<2>(idx, tensor, window);
+}
+
+void ICLKernel::add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<3>(idx, tensor, window);
+}
+
+unsigned int ICLKernel::num_arguments_per_1D_tensor() const
+{
+ return num_arguments_per_tensor<1>();
+}
+
+unsigned int ICLKernel::num_arguments_per_2D_tensor() const
+{
+ return num_arguments_per_tensor<2>();
+}
+
+unsigned int ICLKernel::num_arguments_per_3D_tensor() const
+{
+ return num_arguments_per_tensor<3>();
+}
+
+void ICLKernel::set_target(cl::Device &device)
+{
+ _target = get_target_from_device(device);
+}
+
+void ICLKernel::set_target(GPUTarget target)
+{
+ _target = target;
+}
+
+GPUTarget ICLKernel::get_target() const
+{
+ return _target;
+}
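
add_tensor_argument() pushes, per tensor, the cl buffer, one stride/step pair per dimension and the offset of the first element, i.e. the 2 + 2 * dims arguments reported by num_arguments_per_*_tensor(). A minimal sketch of a derived kernel's run() wiring this up (class and member names are illustrative only, and configure() is omitted):

    #include "arm_compute/core/CL/ICLKernel.h"
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Window.h"

    // Sketch of a derived kernel: one 2D input and one 2D output, enqueued
    // slice by slice over the execution window.
    class ExampleCopyKernel : public arm_compute::ICLKernel
    {
    public:
        void run(const arm_compute::Window &window, cl::CommandQueue &queue) override
        {
            arm_compute::Window slice = window.first_slice_window_2D();
            do
            {
                unsigned int idx = 0;                         // running argument index
                add_2D_tensor_argument(idx, _input, slice);   // buffer + 2 stride/step pairs + offset = 6 args
                add_2D_tensor_argument(idx, _output, slice);  // 6 more
                arm_compute::enqueue(queue, *this, slice);
            }
            while(window.slide_window_slice_2D(slice));
        }

    private:
        const arm_compute::ICLTensor *_input{ nullptr };
        arm_compute::ICLTensor       *_output{ nullptr };
    };
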
diff --git a/src/core/CL/ICLLut.cpp b/src/core/CL/ICLLut.cpp
new file mode 100644
index 0000000000..ea9deac6dc
--- /dev/null
+++ b/src/core/CL/ICLLut.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLLut.h"
+
+using namespace arm_compute;
+
+ICLLut::ICLLut()
+ : _mapping(nullptr)
+{
+}
+
+void ICLLut::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLLut::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+uint8_t *ICLLut::buffer() const
+{
+ return _mapping;
+}
diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp
new file mode 100644
index 0000000000..8ece566e83
--- /dev/null
+++ b/src/core/CL/ICLMultiHOG.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+
+#include "arm_compute/core/IHOG.h"
+
+using namespace arm_compute;
+
+IHOG *ICLMultiHOG::model(size_t index)
+{
+ return cl_model(index);
+}
+
+const IHOG *ICLMultiHOG::model(size_t index) const
+{
+ return cl_model(index);
+}
\ No newline at end of file
diff --git a/src/core/CL/ICLMultiImage.cpp b/src/core/CL/ICLMultiImage.cpp
new file mode 100644
index 0000000000..dbf3fe3e6f
--- /dev/null
+++ b/src/core/CL/ICLMultiImage.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiImage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensor.h"
+
+using namespace arm_compute;
+
+IImage *ICLMultiImage::plane(unsigned int index)
+{
+ return cl_plane(index);
+}
+
+const IImage *ICLMultiImage::plane(unsigned int index) const
+{
+ return cl_plane(index);
+}
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
new file mode 100644
index 0000000000..5dc3e6c8bb
--- /dev/null
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
new file mode 100644
index 0000000000..7b0d011b3e
--- /dev/null
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
new file mode 100644
index 0000000000..fec9d923da
--- /dev/null
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ICLSimpleKernel::ICLSimpleKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
new file mode 100644
index 0000000000..4a7952e108
--- /dev/null
+++ b/src/core/CL/ICLTensor.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+ICLTensor::ICLTensor()
+ : _mapping(nullptr)
+{
+}
+
+void ICLTensor::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLTensor::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+void ICLTensor::clear(cl::CommandQueue &q)
+{
+ this->map(q);
+ std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size());
+ this->unmap(q);
+}
+
+uint8_t *ICLTensor::buffer() const
+{
+ return _mapping;
+}
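
The same map/unmap discipline applies to tensors: buffer() is only a valid host pointer between map() and unmap(), and clear() above is itself just a map/memset/unmap round trip. A minimal sketch, assuming 'tensor' and 'queue' are valid objects:

    #include <cstdint>

    #include "arm_compute/core/CL/ICLTensor.h"

    // Sketch: blocking map, read the first byte on the host, unmap again.
    uint8_t read_first_byte(arm_compute::ICLTensor &tensor, cl::CommandQueue &queue)
    {
        tensor.map(queue, true);
        const uint8_t value = tensor.buffer()[0];
        tensor.unmap(queue);
        return value;
    }
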
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
new file mode 100644
index 0000000000..3b8dfd2465
--- /dev/null
+++ b/src/core/CL/OpenCL.cpp
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <dlfcn.h>
+#include <iostream>
+
+using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *);
+using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *);
+using clReleaseMemObject_func = cl_int (*)(cl_mem);
+using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
+using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue);
+using clReleaseContext_func = cl_int (*)(cl_context);
+using clReleaseEvent_func = cl_int (*)(cl_event);
+using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
+using clRetainProgram_func = cl_int (*)(cl_program program);
+using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
+using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue);
+using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
+using clRetainContext_func = cl_int (*)(cl_context context);
+using clReleaseProgram_func = cl_int (*)(cl_program program);
+using clFlush_func = cl_int (*)(cl_command_queue command_queue);
+using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *);
+using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *);
+using clRetainKernel_func = cl_int (*)(cl_kernel kernel);
+using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
+using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
+using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
+using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
+
+class CLSymbols
+{
+private:
+ CLSymbols()
+ {
+ void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
+ if(handle == nullptr)
+ {
+ std::cerr << "Can't load libOpenCL.so: " << dlerror() << std::endl;
+ }
+ else
+ {
+ clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
+ clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
+ clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
+ clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
+ clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
+ clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
+ clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
+ clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
+ clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
+ clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
+ clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
+ clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
+ clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
+ clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
+ clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
+ clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
+ clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
+ clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
+ clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
+ clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
+ clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
+ clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
+ clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
+ clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+ clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+ clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
+ dlclose(handle);
+ }
+ }
+
+public:
+ static CLSymbols &get()
+ {
+ static CLSymbols symbols = CLSymbols();
+ return symbols;
+ }
+
+ clBuildProgram_func clBuildProgram = nullptr;
+ clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
+ clSetKernelArg_func clSetKernelArg = nullptr;
+ clReleaseKernel_func clReleaseKernel = nullptr;
+ clCreateProgramWithSource_func clCreateProgramWithSource = nullptr;
+ clCreateBuffer_func clCreateBuffer = nullptr;
+ clRetainKernel_func clRetainKernel = nullptr;
+ clCreateKernel_func clCreateKernel = nullptr;
+ clGetProgramInfo_func clGetProgramInfo = nullptr;
+ clFlush_func clFlush = nullptr;
+ clReleaseProgram_func clReleaseProgram = nullptr;
+ clRetainContext_func clRetainContext = nullptr;
+ clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr;
+ clReleaseCommandQueue_func clReleaseCommandQueue = nullptr;
+ clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr;
+ clRetainProgram_func clRetainProgram = nullptr;
+ clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr;
+ clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr;
+ clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr;
+ clReleaseEvent_func clReleaseEvent = nullptr;
+ clReleaseContext_func clReleaseContext = nullptr;
+ clRetainCommandQueue_func clRetainCommandQueue = nullptr;
+ clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
+ clReleaseMemObject_func clReleaseMemObject = nullptr;
+ clGetDeviceInfo_func clGetDeviceInfo = nullptr;
+ clGetDeviceIDs_func clGetDeviceIDs = nullptr;
+};
+
+bool arm_compute::opencl_is_available()
+{
+ return CLSymbols::get().clBuildProgram != nullptr;
+}
+
+cl_int clBuildProgram(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const char *options,
+ void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+ void *user_data)
+{
+ auto func = CLSymbols::get().clBuildProgram;
+ if(func != nullptr)
+ {
+ return func(program, num_devices, device_list, options, pfn_notify, user_data);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueNDRangeKernel(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueNDRangeKernel;
+ if(func != nullptr)
+ {
+ return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clSetKernelArg(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void *arg_value)
+{
+ auto func = CLSymbols::get().clSetKernelArg;
+ if(func != nullptr)
+ {
+ return func(kernel, arg_index, arg_size, arg_value);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseMemObject(cl_mem memobj)
+{
+ auto func = CLSymbols::get().clReleaseMemObject;
+ if(func != nullptr)
+ {
+ return func(memobj);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueUnmapMemObject(
+ cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueUnmapMemObject;
+ if(func != nullptr)
+ {
+ return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clRetainCommandQueue(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clRetainCommandQueue;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseContext(cl_context context)
+{
+ auto func = CLSymbols::get().clReleaseContext;
+ if(func != nullptr)
+ {
+ return func(context);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+cl_int clReleaseEvent(cl_event event)
+{
+ auto func = CLSymbols::get().clReleaseEvent;
+ if(func != nullptr)
+ {
+ return func(event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueWriteBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueWriteBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueReadBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueReadBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetProgramBuildInfo(
+ cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetProgramBuildInfo;
+ if(func != nullptr)
+ {
+ return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clRetainProgram(cl_program program)
+{
+ auto func = CLSymbols::get().clRetainProgram;
+ if(func != nullptr)
+ {
+ return func(program);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+void *clEnqueueMapBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clEnqueueMapBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clReleaseCommandQueue(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clReleaseCommandQueue;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_program clCreateProgramWithBinary(
+ cl_context context,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const size_t *lengths,
+ const unsigned char **binaries,
+ cl_int *binary_status,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateProgramWithBinary;
+ if(func != nullptr)
+ {
+ return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clRetainContext(cl_context context)
+{
+ auto func = CLSymbols::get().clRetainContext;
+ if(func != nullptr)
+ {
+ return func(context);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseProgram(cl_program program)
+{
+ auto func = CLSymbols::get().clReleaseProgram;
+ if(func != nullptr)
+ {
+ return func(program);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clFlush(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clFlush;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetProgramInfo(
+ cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetProgramInfo;
+ if(func != nullptr)
+ {
+ return func(program, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_kernel clCreateKernel(
+ cl_program program,
+ const char *kernel_name,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateKernel;
+ if(func != nullptr)
+ {
+ return func(program, kernel_name, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clRetainKernel(cl_kernel kernel)
+{
+ auto func = CLSymbols::get().clRetainKernel;
+ if(func != nullptr)
+ {
+ return func(kernel);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_mem clCreateBuffer(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void *host_ptr,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateBuffer;
+ if(func != nullptr)
+ {
+ return func(context, flags, size, host_ptr, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_program clCreateProgramWithSource(
+ cl_context context,
+ cl_uint count,
+ const char **strings,
+ const size_t *lengths,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateProgramWithSource;
+ if(func != nullptr)
+ {
+ return func(context, count, strings, lengths, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clReleaseKernel(cl_kernel kernel)
+{
+ auto func = CLSymbols::get().clReleaseKernel;
+ if(func != nullptr)
+ {
+ return func(kernel);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id *devices,
+ cl_uint *num_devices)
+{
+ auto func = CLSymbols::get().clGetDeviceIDs;
+ if(func != nullptr)
+ {
+ return func(platform, device_type, num_entries, devices, num_devices);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetDeviceInfo;
+ if(func != nullptr)
+ {
+ return func(device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
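
Because every stub above falls back to CL_OUT_OF_RESOURCES (or a null object) when libOpenCL.so cannot be loaded, callers can probe availability once up front instead of handling per-call failures. A minimal sketch:

    #include <iostream>

    #include "arm_compute/core/CL/OpenCL.h"

    int main()
    {
        // True only if libOpenCL.so was found and clBuildProgram was resolved
        // by the CLSymbols loader above.
        if(!arm_compute::opencl_is_available())
        {
            std::cerr << "No OpenCL runtime found, skipping CL paths\n";
            return 0;
        }
        // Safe to create CL contexts, queues and kernels from here on.
        return 0;
    }
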
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
new file mode 100644
index 0000000000..1761342eb4
--- /dev/null
+++ b/src/core/CL/cl_kernels/absdiff.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the absolute difference of two input images.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] in1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] in1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] in2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] in2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void absdiff(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
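
The @attention note above means the element types are baked in at compile time rather than passed as kernel arguments. A minimal host-side sketch of matching build options for a U8 + U8 -> S16 configuration, assuming the "absdiff" entry point is registered in the library's kernel/program map:

    #include <set>
    #include <string>

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    // Sketch: compile absdiff.cl for U8 inputs and an S16 output, matching the
    // -DDATA_TYPE_* convention documented in the kernel header.
    arm_compute::Kernel build_u8_to_s16_absdiff()
    {
        const std::set<std::string> build_opts =
        {
            "-DDATA_TYPE_IN1=uchar", "-DDATA_TYPE_IN2=uchar", "-DDATA_TYPE_OUT=short"
        };
        return arm_compute::CLKernelLibrary::get().create_kernel("absdiff", build_opts);
    }
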
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
new file mode 100644
index 0000000000..39c1512c3c
--- /dev/null
+++ b/src/core/CL/cl_kernels/accumulate.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function accumulates an input image into an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void accumulate(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu))
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ uchar16 in_data = vload16(0, input.ptr);
+ short16 accu_data = vload16(0, (__global short *)accu.ptr);
+
+ // Perform accumulation
+ short16 res = add_sat(convert_short16(in_data), accu_data);
+
+ // Store result
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
+
+/** This function accumulates a weighted value from an input image to an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] alpha The float scalar weighting value, in the range of 0 to 1
+ */
+__kernel void accumulate_weighted(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const float alpha)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ const float16 in_data = convert_float16(vload16(0, input.ptr));
+ const float16 accu_data = convert_float16(vload16(0, accu.ptr));
+
+ // Calculate weighted accumulation
+ const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
+
+ // Store result
+ vstore16(res, 0, accu.ptr);
+}
+
+/** This function accumulates a squared value from an input image to an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The U32 scalar shift amount, in the range of 0 to 15
+ */
+__kernel void accumulate_squared(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const uint shift)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ ushort16 in_data = convert_ushort16(vload16(0, input.ptr));
+ uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
+
+ // Calculate squared accumulation
+ short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
+
+ // Store result
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
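Per pixel, the two accumulation kernels above reduce to simple arithmetic once the 16-wide vectorisation is stripped away. The scalar C sketch below is illustrative only (the helper names are hypothetical and the intermediate widths of the vectorised kernels are simplified); it assumes a U8 source, a U8 weighted accumulator and an S16 squared accumulator, matching the data types documented above.

    #include <stdint.h>

    /* accumulate_weighted: accu = (1 - alpha) * accu + alpha * input, with alpha in [0, 1] */
    static uint8_t accumulate_weighted_pixel(uint8_t accu, uint8_t input, float alpha)
    {
        return (uint8_t)((1.0f - alpha) * (float)accu + alpha * (float)input);
    }

    /* accumulate_squared: accu = saturate_s16(accu + ((input * input) >> shift)), with shift in [0, 15] */
    static int16_t accumulate_squared_pixel(int16_t accu, uint8_t input, unsigned int shift)
    {
        int32_t res = (int32_t)accu + (int32_t)(((uint32_t)input * (uint32_t)input) >> shift);

        /* Saturate on overflow, mirroring convert_short16_sat(); the added term is
         * non-negative, so only the upper bound can be exceeded. */
        if(res > INT16_MAX)
        {
            res = INT16_MAX;
        }
        return (int16_t)res;
    }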
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
new file mode 100644
index 0000000000..e3cbb6c801
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
+ * @note Distinction between floating point and integer is made using the -DTYPE_FP and -DTYPE_INT preprocessor arguments
+ * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation
+#if defined LOGISTIC
+ data = 1 / (1 + exp(-data));
+#elif defined TANH
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
+#elif defined RELU
+    data = max((VEC_DATA_TYPE(DATA_TYPE, 16))0, data);
+#elif defined BRELU
+    data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max((VEC_DATA_TYPE(DATA_TYPE, 16))0, data));
+#elif defined SRELU
+ data = log(1 + exp(data));
+#elif defined ABS
+#if defined TYPE_INT
+ data = abs(data);
+#else
+ data = fabs(data);
+#endif
+#elif defined SQUARE
+ data = data * data;
+#elif defined SQRT
+ data = sqrt(data);
+#elif defined LINEAR
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B;
+#endif
+
+ // Store result
+ vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+}
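Which activation is applied is decided entirely at compile time by the build options listed in the notes above; the kernel body itself is branch-free at runtime. As a rough illustration, a program using this kernel might be built with options such as "-DDATA_TYPE=float -DTYPE_FP -DBRELU -DA=6.0f" (values here are only an example). The scalar C sketch below shows what two of the branches compute for float data; the function names are hypothetical.

    #include <math.h>

    /* LOGISTIC branch: 1 / (1 + exp(-x)) */
    static float activation_logistic(float x)
    {
        return 1.0f / (1.0f + expf(-x));
    }

    /* BRELU branch: min(A, max(0, x)), where a stands for the -DA= constant */
    static float activation_brelu(float x, float a)
    {
        return fminf(a, fmaxf(0.0f, x));
    }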
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
new file mode 100644
index 0000000000..434300efa8
--- /dev/null
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else
+#define ADD(x, y) ((x) + (y))
+#define SUB(x, y) ((x) - (y))
+#endif
+
+/** This function adds two images.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_add(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+
+/** This function subtracts one image from another.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_sub(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
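The only difference the SATURATE define makes is how an out-of-range result is folded back into the output type: add_sat()/sub_sat() clamp, while the plain operators wrap. A scalar C sketch of the two policies for a uchar + uchar -> uchar addition (illustrative helper names, not part of the library):

    #include <stdint.h>

    /* Wrapping policy: rely on unsigned modular arithmetic */
    static uint8_t add_wrap_u8(uint8_t a, uint8_t b)
    {
        return (uint8_t)(a + b);
    }

    /* Saturating policy: clamp to the representable range, as add_sat() does */
    static uint8_t add_sat_u8(uint8_t a, uint8_t b)
    {
        const uint16_t sum = (uint16_t)a + (uint16_t)b;
        return (uint8_t)(sum > UINT8_MAX ? UINT8_MAX : sum);
    }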
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
new file mode 100644
index 0000000000..13e6702334
--- /dev/null
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+ VECTOR_DECLARATION(beta),
+ VECTOR_DECLARATION(gamma),
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+ float4 _in = 0;
+ float4 denominator = 0;
+ float4 numerator = 0;
+ float4 x_bar = 0;
+ float4 gamma_vec = 0;
+ float4 beta_vec = 0;
+
+ const int current_slice = get_global_id(2);
+
+ _in = vload4(0, (__global float *)in.ptr);
+ denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
+ denominator = rsqrt(denominator + epsilon);
+
+ // Calculate x bar and store results
+ numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = _in - numerator;
+ x_bar = numerator * denominator;
+
+    gamma_vec = *((__global float *)(gamma.ptr + current_slice * gamma.stride_x));
+ beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+
+ vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+}
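Per element, the kernel above evaluates the standard inference-time batch normalization expression, with mean, var, beta and gamma indexed by the Z slice (one value per feature map). A scalar C sketch of the same computation for float data (the function name is hypothetical):

    #include <math.h>

    /* out = gamma * (x - mean) / sqrt(var + epsilon) + beta */
    static float batchnorm_pixel(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        const float x_bar = (x - mean) / sqrtf(var + epsilon);
        return gamma * x_bar + beta;
    }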
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl
new file mode 100644
index 0000000000..135bfa989c
--- /dev/null
+++ b/src/core/CL/cl_kernels/bitwise_op.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a | in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a & in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a ^ in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise NOT of an image.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_data = vload16(0, in.ptr);
+
+ vstore16(~in_data, 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
new file mode 100644
index 0000000000..ec6719213c
--- /dev/null
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the magnitude and phase from the horizontal and vertical results of the Sobel filter.
+ *
+ * @note The calculation of gradient uses level 1 normalisation.
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L1(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Construct images
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load sobel horizontal and vertical values
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
+
+ /* Calculate the gradient, using level 1 normalisation method */
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
+ m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
+
+ /* Calculate the angle */
+ float4 p = atan2pi(convert_float4(v), convert_float4(h));
+
+ /* Remap angle to range [0, 256) */
+ p = select(p, p + 2, p < 0.0f) * 128.0f;
+
+ /* Store results */
+ vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
+ vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
+
+/** Calculate the gradient and angle from the horizontal and vertical results of the Sobel filter.
+ *
+ * @note The calculation of gradient uses level 2 normalisation
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L2(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Construct images
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load sobel horizontal and vertical values
+ float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
+ float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
+
+ /* Calculate the gradient, using level 2 normalisation method */
+ float4 m = sqrt(h * h + v * v);
+
+ /* Calculate the angle */
+ float4 p = atan2pi(v, h);
+
+ /* Remap angle to range [0, 256) */
+ p = select(p, p + 2, p < 0.0f) * 128.0f;
+
+ /* Store results */
+ vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
+ vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
+
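Both combine_gradients kernels produce a magnitude (the L1 or L2 norm of the Sobel pair) and a phase remapped from atan2pi's [-1, 1] range onto an unsigned byte in [0, 256). A scalar C sketch of the L2 variant for one pixel (illustrative only; the helper name is hypothetical and the rounding is an approximation of convert_uchar_sat_rte):

    #include <math.h>
    #include <stdint.h>

    static void combine_gradients_L2_pixel(float h, float v, float *magnitude, uint8_t *angle)
    {
        *magnitude = sqrtf(h * h + v * v);

        float p = atan2f(v, h) * 0.318309886f; /* atan2pi equivalent, in [-1, 1] */
        if(p < 0.0f)
        {
            p += 2.0f;                         /* remap to [0, 2) */
        }

        const long q = lrintf(p * 128.0f);     /* round to nearest */
        *angle = (uint8_t)(q > 255 ? 255 : q); /* saturate to U8 */
    }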
+/** Array that holds the relative coordinates offset for the neighbouring pixels.
+ */
+__constant short4 neighbours_coords[] =
+{
+ { -1, 0, 1, 0 }, // 0
+ { -1, 1, 1, -1 }, // 45
+ { 0, 1, 0, -1 }, // 90
+ { 1, 1, -1, -1 }, // 135
+ { 1, 0, -1, 0 }, // 180
+ { 1, -1, -1, 1 }, // 225
+ { 0, 1, 0, -1 }, // 270
+ { -1, -1, 1, 1 }, // 315
+ { -1, 0, 1, 0 }, // 360
+};
+
+/** Perform non maximum suppression.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] non_max_ptr Pointer to the non maximum suppressed output. Supported data types: U16, U32
+ * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] lower_thr The low threshold
+ */
+__kernel void suppress_non_maximum(
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle),
+ IMAGE_DECLARATION(non_max),
+ uint lower_thr)
+{
+ // Construct images
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+ Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
+
+ // Get gradient and angle
+ DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
+    uchar an = *angle.ptr;
+
+ if(gradient <= lower_thr)
+ {
+ return;
+ }
+
+ // Divide the whole round into 8 directions
+ uchar ang = 127 - an;
+ DATA_TYPE_OUT q_an = (ang + 16) >> 5;
+
+ // Find the two pixels in the perpendicular direction
+ short2 x_p = neighbours_coords[q_an].s02;
+ short2 y_p = neighbours_coords[q_an].s13;
+ DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
+ DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
+
+ if((gradient > g1) && (gradient > g2))
+ {
+ *((global DATA_TYPE_OUT *)non_max.ptr) = gradient;
+ }
+}
+
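The quantisation in suppress_non_maximum turns the 8-bit angle into an index into the neighbours_coords table above; each bucket spans 32 angle units (45 degrees), and the selected row holds the two neighbour offsets used for the comparison. A scalar C sketch of the index computation, mirroring "ang = 127 - an; q_an = (ang + 16) >> 5" (the helper name is hypothetical):

    #include <stdint.h>

    /* Map an 8-bit angle onto a row (0..8) of the neighbours_coords table */
    static int quantise_direction(uint8_t an)
    {
        const uint8_t ang = (uint8_t)(127 - an); /* wraps modulo 256, as uchar arithmetic does */
        return (ang + 16) >> 5;
    }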
+#define EDGE 255
+#define hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side
+#define hysteresis_local_stack_L2 16 // The size of level 2 stack; adjusting this can impact the match rate with the VX implementation
+
+/** Check whether pixel is valid
+*
+* Skip the pixel if early_test is true.
+* Otherwise, try to add the pixel coordinate to the level 1 stack; if the stack is full, proceed to popping the stack instead.
+*
+* @param[in] early_test Boolean condition based on the minv check and visited buffer check
+* @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] x_cur X-coordinate of current central pixel
+* @param[in] y_cur Y-coordinate of current central pixel
+*/
+#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \
+ { \
+ if(!early_test) \
+ { \
+ /* Number of elements in the local stack 1, points to next available entry */ \
+ c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \
+ \
+ if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \
+ goto pop_stack; \
+ \
+ /* The pixel that has already been recorded is ignored */ \
+ if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \
+ { \
+ l1_ptr[c] = (short2)(x_pos, y_pos); \
+ *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \
+ } \
+ } \
+ }
+
+/** Perform hysteresis.
+ *
+ * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
+ *
+ * @param[in] src_ptr Pointer to the input image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] out_ptr Pointer to the output image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
+ * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
+ * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32
+ * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8
+ * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] low_thr The lower threshold
+ * @param[in] up_thr The upper threshold
+ * @param[in] width The width of the image.
+ * @param[in] height The height of the image
+ */
+kernel void hysteresis(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(visited),
+ IMAGE_DECLARATION(recorded),
+ IMAGE_DECLARATION(l1_stack),
+ IMAGE_DECLARATION(l1_stack_counter),
+ uint low_thr,
+ uint up_thr,
+ int width,
+ int height)
+{
+ // Create images
+ Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
+ Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
+ Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
+ Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
+ Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
+
+ // Index
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ // Load value
+ DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
+
+    // If less than or equal to the upper threshold, set to NO_EDGE and return
+ if(val <= up_thr)
+ {
+ *offset(&out, x, y) = 0;
+ return;
+ }
+
+ // Init local stack 2
+ short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
+ int L2_counter = 0;
+
+ // Perform recursive hysteresis
+ while(true)
+ {
+ // Get L1 stack pointer
+ __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
+
+ // If the pixel has already been visited, proceed with the items in the stack instead
+ if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
+ {
+ goto pop_stack;
+ }
+
+ // Set strong edge
+ *offset(&out, x, y) = EDGE;
+
+    // If the level 2 stack is full, skip checking the surrounding pixels and proceed to popping the stack
+ if(L2_counter > (hysteresis_local_stack_L2 - 1))
+ {
+ goto pop_stack2;
+ }
+
+    // Number of elements currently in the level 1 stack
+ char c;
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ x_tmp;
+ uint4 v_tmp;
+
+ // Get direction pixel indices
+ int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
+
+    // Check the 8 surrounding pixels for weak edges where low_thr < val <= up_thr
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
+
+#undef check_pixel
+
+pop_stack:
+ c = *((__global char *)offset(&l1_stack_counter, x, y));
+
+ if(c >= 1)
+ {
+ *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
+ int2 l_c = convert_int2(l1_ptr[c - 1]);
+
+ // Push the current position into level 2 stack
+ stack_L2[L2_counter].x = x;
+ stack_L2[L2_counter].y = y;
+
+ x = l_c.x;
+ y = l_c.y;
+
+ L2_counter++;
+
+ continue;
+ }
+
+ if(L2_counter > 0)
+ {
+ goto pop_stack2;
+ }
+ else
+ {
+ return;
+ }
+
+pop_stack2:
+ L2_counter--;
+ x = stack_L2[L2_counter].x;
+ y = stack_L2[L2_counter].y;
+ };
+}
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
new file mode 100644
index 0000000000..93e80b925e
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function combines three planes to a single RGB image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: RGB
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGB888(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar16 data1 = vload16(0, plane1.ptr);
+ uchar16 data2 = vload16(0, plane2.ptr);
+
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
+ data0.s1, data1.s1, data2.s1,
+ data0.s2, data1.s2, data2.s2,
+ data0.s3, data1.s3, data2.s3,
+ data0.s4, data1.s4, data2.s4,
+ data0.s5);
+ vstore16(out0, 0, dst.ptr);
+
+ uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
+ data1.s6, data2.s6, data0.s7,
+ data1.s7, data2.s7, data0.s8,
+ data1.s8, data2.s8, data0.s9,
+ data1.s9, data2.s9, data0.sA,
+ data1.sA);
+ vstore16(out1, 0, dst.ptr + 16);
+
+ uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
+ data2.sB, data0.sC, data1.sC,
+ data2.sC, data0.sD, data1.sD,
+ data2.sD, data0.sE, data1.sE,
+ data2.sE, data0.sF, data1.sF,
+ data2.sF);
+ vstore16(out2, 0, dst.ptr + 32);
+}
+
+/** This function combines four planes to a single RGBA image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] plane3_ptr Pointer to the fourth plane. Supported Format: U8
+ * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes)
+ * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes)
+ * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: RGBA
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGBA8888(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(plane3),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar16 data1 = vload16(0, plane1.ptr);
+ uchar16 data2 = vload16(0, plane2.ptr);
+ uchar16 data3 = vload16(0, plane3.ptr);
+
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
+ data0.s1, data1.s1, data2.s1, data3.s1,
+ data0.s2, data1.s2, data2.s2, data3.s2,
+ data0.s3, data1.s3, data2.s3, data3.s3);
+ vstore16(out0, 0, dst.ptr);
+
+ uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
+ data0.s5, data1.s5, data2.s5, data3.s5,
+ data0.s6, data1.s6, data2.s6, data3.s6,
+ data0.s7, data1.s7, data2.s7, data3.s7);
+ vstore16(out1, 0, dst.ptr + 16);
+
+ uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
+ data0.s9, data1.s9, data2.s9, data3.s9,
+ data0.sA, data1.sA, data2.sA, data3.sA,
+ data0.sB, data1.sB, data2.sB, data3.sB);
+ vstore16(out2, 0, dst.ptr + 32);
+
+ uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
+ data0.sD, data1.sD, data2.sD, data3.sD,
+ data0.sE, data1.sE, data2.sE, data3.sE,
+ data0.sF, data1.sF, data2.sF, data3.sF);
+ vstore16(out3, 0, dst.ptr + 48);
+}
+
+/** This function combines three planes to a single YUYV image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: YUYV
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_YUYV422(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar8 data1 = vload8(0, plane1.ptr);
+ uchar8 data2 = vload8(0, plane2.ptr);
+
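+ // Pack as Y U Y V: each U/V sample is shared by a pair of adjacent luma samples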
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
+ data0.s2, data1.s1, data0.s3, data2.s1,
+ data0.s4, data1.s2, data0.s5, data2.s2,
+ data0.s6, data1.s3, data0.s7, data2.s3);
+ vstore16(out0, 0, dst.ptr);
+ uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
+ data0.sA, data1.s5, data0.sB, data2.s5,
+ data0.sC, data1.s6, data0.sD, data2.s6,
+ data0.sE, data1.s7, data0.sF, data2.s7);
+ vstore16(out1, 0, dst.ptr + 16);
+}
+
+/** This function combines three planes to a single UYVY image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: UYVY
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_UYVY422(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar8 data1 = vload8(0, plane1.ptr);
+ uchar8 data2 = vload8(0, plane2.ptr);
+
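+ // Pack as U Y V Y: chroma comes first, and each U/V sample is shared by a pair of adjacent luma samples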
+ uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
+ data1.s1, data0.s2, data2.s1, data0.s3,
+ data1.s2, data0.s4, data2.s2, data0.s5,
+ data1.s3, data0.s6, data2.s3, data0.s7);
+ vstore16(out0, 0, dst.ptr);
+ uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
+ data1.s5, data0.sA, data2.s5, data0.sB,
+ data1.s6, data0.sC, data2.s6, data0.sD,
+ data1.s7, data0.sE, data2.s7, data0.sF);
+ vstore16(out1, 0, dst.ptr + 16);
+}
+
+/** This function combines three planes to a single NV12/NV21 image.
+ *
+ * @note NV12 or NV21 has to be specified through a preprocessor macro, e.g. -DNV12 performs NV12 channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void channel_combine_NV(
+ IMAGE_DECLARATION(src_plane0),
+ IMAGE_DECLARATION(src_plane1),
+ IMAGE_DECLARATION(src_plane2),
+ IMAGE_DECLARATION(dst_plane0),
+ IMAGE_DECLARATION(dst_plane1),
+ uint height)
+{
+ // Get pixels pointer
+ Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+ Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+ Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+ Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+ Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+
+ // Copy plane data
+ vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+ vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+
+ // Create UV plane
+ uchar8 data1 = vload8(0, src_plane1.ptr);
+ uchar8 data2 = vload8(0, src_plane2.ptr);
+
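+ // Interleave the U and V planes into the single chroma plane (NV12 stores U first, NV21 stores V first)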
+#if defined NV12
+ vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#elif defined NV21
+ vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#endif
+}
+
+/** This function combines three planes to a single YUV444 or IYUV image.
+ *
+ * @note YUV444 or IYUV has to be specified through a preprocessor macro, e.g. -DIYUV performs IYUV channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void copy_planes_3p(
+ IMAGE_DECLARATION(src_plane0),
+ IMAGE_DECLARATION(src_plane1),
+ IMAGE_DECLARATION(src_plane2),
+ IMAGE_DECLARATION(dst_plane0),
+ IMAGE_DECLARATION(dst_plane1),
+ IMAGE_DECLARATION(dst_plane2),
+ uint height)
+{
+ // Get pixels pointer
+ Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+ Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+ Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+ Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+ Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+ Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
+
+ // Copy plane data
+ vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
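+ // YUV444: all three planes are copied at full resolution; IYUV: copy the luma rows at y and y + height (sub-sampled height) and 8 bytes of each half-width chroma plane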
+#if defined YUV444
+ vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
+ vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#elif defined IYUV
+ vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+ vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
+ vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#endif
+}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
new file mode 100644
index 0000000000..14c6c8a92a
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function extracts a given channel from an RGB image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGB
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGB888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+ uchar8 data2 = vload8(0, src.ptr + 16);
+
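+ // Each colour channel occupies every third byte; the 8 extracted values span both loads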
+#if defined CHANNEL_R
+ vstore4(data.s0369, 0, dst.ptr);
+ vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
+#elif defined CHANNEL_G
+ vstore4(data.s147A, 0, dst.ptr);
+ vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
+#elif defined CHANNEL_B
+ vstore4(data.s258B, 0, dst.ptr);
+ vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
+#endif
+}
+
+/** This function extracts a given channel from an RGBA image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGBA8888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+ uchar16 data2 = vload16(0, src.ptr + 16);
+
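+ // Each channel occupies every fourth byte of the 32 loaded bytes (8 RGBA pixels)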
+#if defined CHANNEL_R
+ vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
+#elif defined CHANNEL_G
+ vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
+#elif defined CHANNEL_B
+ vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
+#elif defined CHANNEL_A
+ vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from a YUYV image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_YUYV422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
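+ // YUYV layout: Y at even offsets, U at offsets 1, 5, 9, ..., V at offsets 3, 7, 11, ...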
+#if defined CHANNEL_Y
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_U
+ vstore4(data.s159D, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore4(data.s37BF, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from a UYVY image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: UYVY
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_UYVY422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
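+ // UYVY layout: Y at odd offsets, U at offsets 0, 4, 8, ..., V at offsets 2, 6, 10, ...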
+#if defined CHANNEL_Y
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_U
+ vstore4(data.s048C, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore4(data.s26AE, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV12 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only the U and V channels can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV12(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
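+ // NV12 chroma plane: U at even offsets, V at odd offsets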
+#if defined CHANNEL_U
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV21 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only the U and V channels can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV21(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
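+ // NV21 chroma plane: V at even offsets, U at odd offsets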
+#if defined CHANNEL_U
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given plane from a multi-planar image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void copy_plane(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Copy plane data
+ vstore16(vload16(0, src.ptr), 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
new file mode 100644
index 0000000000..f5ec85ae76
--- /dev/null
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -0,0 +1,1823 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Convert an RGB888 image to RGBX8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 16 pixels every time
+ uchar16 rgb_0 = vload16(0, in.ptr);
+ uchar16 rgb_1 = vload16(0, in.ptr + 16);
+ uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
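+ // Expand 16 RGB pixels (48 bytes) into 16 RGBA pixels (64 bytes), setting alpha to 255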
+ uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
+ uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+ uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
+ uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+ vstore16(rgba_2, 0, out.ptr + 32);
+ vstore16(rgba_3, 0, out.ptr + 48);
+}
+
+/** Convert an RGBX8888 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGBA8888_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+ // handle 16 pixels every time
+ uchar16 rgba_0 = vload16(0, in.ptr);
+ uchar16 rgba_1 = vload16(0, in.ptr + 16);
+ uchar16 rgba_2 = vload16(0, in.ptr + 32);
+ uchar16 rgba_3 = vload16(0, in.ptr + 48);
+
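+ // Drop the alpha byte of each of the 16 pixels: 64 input bytes become 48 output bytes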
+ uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
+ uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
+ uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore16(rgb_1, 0, out.ptr + 16);
+ vstore16(rgb_2, 0, out.ptr + 32);
+}
+
+/** Convert a UYVY422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
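+ // BT.709: R' = Y' + 1.5748*Cr, G' = Y' - 0.1873*Cb - 0.4681*Cr, B' = Y' + 1.8556*Cb; each Cb/Cr sample is duplicated for its pair of luma samples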
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a UYVY422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+ char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+ char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+ char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+ char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert an RGB888 image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGB888_to_NV12_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(luma),
+ IMAGE_DECLARATION(uv))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+ // handle 4 pixels every time, two lines, each line for 2 pixels
+ // Read 2 pixel of the first line
+ uchar8 rgb_0 = vload8(0, in.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
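+ // BT.709: Y' = 0.2126*R' + 0.7152*G' + 0.0722*B'; Cb and Cr are offset by 128 and clamped to [0, 255]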
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixel of the second line
+ uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
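+ // Average the four chroma values of the 2x2 block to produce a single interleaved U/V pair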
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+
+ vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/*
+ R'= Y' + 0.0000*U + 1.5748*V
+ G'= Y' - 0.1873*U - 0.4681*V
+ B'= Y' + 1.8556*U + 0.0000*V
+*/
+
+/** Convert an NV12 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image uv channel in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // handle 8 pixels every time, two lines, each line for 4 pixels
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
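+ // Each interleaved U/V pair is shared by a 2x2 block of luma samples, so the chroma terms are reused for both rows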
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an RGB888 image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_YUV444_bt709(
+ IMAGE_DECLARATION(rgb_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // handle 4 pixels every time
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 4 pixels
+ uchar16 rgb_0 = vload16(0, in_rgb.ptr);
+ uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
+ uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
+ uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
+
+ float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+ float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+ float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+ short4 i_y = convert_short4_rtz(f_y);
+ short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
+ short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
+
+ uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
+ vstore4(luma_0, 0, out_y.ptr);
+
+ uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
+ uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
+ vstore4(cb_0, 0, out_u.ptr);
+ vstore4(cr_0, 0, out_v.ptr);
+}
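+
+/* Illustrative reference only (not one of the library kernels): the RGB-to-YUV kernels
+ * in this file apply the full-range BT709 analysis coefficients per pixel and then clamp
+ * to the U8 range, exactly as the vector code above does four pixels at a time. The
+ * helper below is hypothetical and only spells the arithmetic out for a single pixel.
+ */
+inline uchar3 rgb_to_yuv444_bt709_reference(uchar r, uchar g, uchar b)
+{
+ float f_y = 0.2126f * r + 0.7152f * g + 0.0722f * b;
+ float f_u = -0.1146f * r - 0.3854f * g + 0.5000f * b;
+ float f_v = 0.5000f * r - 0.4542f * g - 0.0458f * b;
+
+ // Truncate towards zero, re-centre the chroma around 128 and clamp to [0, 255]
+ short i_y = convert_short_rtz(f_y);
+ short i_u = convert_short_rtz(f_u) + (short)128;
+ short i_v = convert_short_rtz(f_v) + (short)128;
+
+ return (uchar3)(convert_uchar(clamp(i_y, (short)0, (short)255)),
+                 convert_uchar(clamp(i_u, (short)0, (short)255)),
+                 convert_uchar(clamp(i_v, (short)0, (short)255)));
+}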
+
+/** Convert an RGB image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_IYUV_bt709(
+ IMAGE_DECLARATION(rgb_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // handle 4 pixels every time, two lines, each line for 2 pixels
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 2 pixels of the first line
+ uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ *out_u.ptr = cbcr.x;
+ *out_v.ptr = cbcr.y;
+}
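+
+/* Illustrative note: the IYUV and NV12 producers in this file subsample chroma by two in
+ * both directions, averaging the four Cb (respectively Cr) values computed for a 2x2
+ * block of luma positions; that is what the "/ 4" above computes. A scalar sketch with
+ * a hypothetical helper name:
+ */
+inline uchar average_chroma_2x2(uchar c00, uchar c01, uchar c10, uchar c11)
+{
+ // The uchar operands promote to int, so the sum cannot wrap before the division
+ return (uchar)((c00 + c01 + c10 + c11) / 4);
+}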
+
+/** Convert an RGBA image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_YUV444_bt709(
+ IMAGE_DECLARATION(rgba_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // handle 4 pixels every time
+ Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 4 pixels
+ uchar16 rgb_0 = vload16(0, in_rgba.ptr);
+ uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
+ uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
+ uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
+
+ float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+ float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+ float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+ short4 i_y = convert_short4_rtz(f_y);
+ short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
+ short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
+
+ uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255))));
+ vstore4(luma_0, 0, out_y.ptr);
+
+ uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255))));
+ uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255))));
+ vstore4(cb_0, 0, out_u.ptr);
+ vstore4(cr_0, 0, out_v.ptr);
+}
+
+/** Convert an RGBA image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGBA8888_to_NV12_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+ // Read 2 pixels of the first line
+ uchar8 rgb_0 = vload8(0, in.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert an RGBA image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_IYUV_bt709(
+ IMAGE_DECLARATION(rgba_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // handle 4 pixels every time, two lines, each line for 2 pixels
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 2 pixels of the first line
+ uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ *out_u.ptr = cbcr.x;
+ *out_v.ptr = cbcr.y;
+}
+
+/** Convert an NV12 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
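+ // handle 8 pixels every time, two lines, each line for 4 pixels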
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
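+
+/* Illustrative reference only (not one of the library kernels): the NV12/NV21 to RGB and
+ * RGBA kernels reconstruct each pixel with the full-range BT709 synthesis coefficients,
+ * truncating towards zero exactly as the vector code above does. The helper name and
+ * signature below are hypothetical.
+ */
+inline uchar3 yuv_to_rgb_bt709_reference(uchar y, uchar u, uchar v)
+{
+ // Re-centre the chroma samples around zero before applying the matrix
+ float cb = (float)u - 128.0f;
+ float cr = (float)v - 128.0f;
+
+ float f_r = (float)y + 1.5748f * cr;
+ float f_g = (float)y - 0.1873f * cb - 0.4681f * cr;
+ float f_b = (float)y + 1.8556f * cb;
+
+ // Like the kernels, truncate towards zero without clamping out-of-range values
+ return (uchar3)(convert_uchar_rtz(f_r), convert_uchar_rtz(f_g), convert_uchar_rtz(f_b));
+}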
+
+/** Convert an NV12 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // handle 32 pixels every time, two lines, each line for 16 pixels
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
+ uchar8 cb = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+ uchar8 cr = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore8(cb, 0, out_u.ptr);
+ vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert an NV12 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // handle 32 pixels every time, two lines, each line for 16 pixels
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
+ uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+ cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+ uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+ cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // handle 8 pixels every time, two lines, each line for 4 pixels
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgba_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+ // handle 8 pixels every time, two lines, each line for 4 pixels
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // handle 32 pixels every time, two lines, each line for 16 pixels
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
+ uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+ cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+ uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+ cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
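+ // handle 32 pixels every time, two lines, each line for 16 pixels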
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
+ uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+ uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore8(cb, 0, out_u.ptr);
+ vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert a UYVY image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void UYVY422_to_IYUV_bt709(
+ IMAGE_DECLARATION(uyvy_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // handle 16 pixels every time, two lines, each line for 8 pixels
+ uchar16 uyvy = vload16(0, in_uyvy.ptr);
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+ ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+ vstore8(luma, 0, out_y.ptr);
+
+ uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
+ luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+ ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+ uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+ vstore4(cb, 0, out_u.ptr);
+ vstore4(cr, 0, out_v.ptr);
+}
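+
+/* Illustrative note: UYVY422 packs two horizontally adjacent pixels into four bytes laid
+ * out as [U0, Y0, V0, Y1], so the kernel above takes luma from the odd byte lanes and the
+ * shared chroma from the even ones (YUYV422, handled below, swaps the luma and chroma
+ * lanes). A scalar sketch of unpacking one macropixel, with a hypothetical helper name:
+ */
+inline uchar4 unpack_uyvy_macropixel(uchar4 uyvy)
+{
+ // Returns (Y0, Y1, U, V) for the two pixels packed in this macropixel
+ return (uchar4)(uyvy.s1, uyvy.s3, uyvy.s0, uyvy.s2);
+}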
+
+/** Convert a YUYV image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void YUYV422_to_IYUV_bt709(
+ IMAGE_DECLARATION(yuyv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Handle 16 pixels per work item: 8 pixels from each of two input lines
+ uchar16 yuyv = vload16(0, in_yuyv.ptr);
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+ ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+ vstore8(luma, 0, out_y.ptr);
+
+ yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+ luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+ ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
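+    // As in the UYVY kernel, average the chroma samples of the two rows to produce the 4:2:0 chroma row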
+ uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+ uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+ vstore4(cb, 0, out_u.ptr);
+ vstore4(cr, 0, out_v.ptr);
+}
+
+/** Convert an IYUV image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // Handle 8 pixels per work item: two lines of 4 pixels each
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+ char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
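+    // BT.709 YCbCr -> RGB, i.e. the coefficients used below:
+    //   R = Y + 1.5748 * Cr
+    //   G = Y - 0.1873 * Cb - 0.4681 * Cr
+    //   B = Y + 1.8556 * Cb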
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an IYUV image to RGBA8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(rgba_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+    // Handle 8 pixels per work item: two lines of 4 pixels each
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+ char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
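+    // Same BT.709 YCbCr -> RGB equations as in IYUV_to_RGB888_bt709; the alpha channel is set to 255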
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an IYUV image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void IYUV_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Handle 32 pixels per work item: two lines of 16 pixels each
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar8 cb_src = vload8(0, in_u.ptr);
+ uchar8 cr_src = vload8(0, in_v.ptr);
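+    // 4:2:0 -> 4:4:4 chroma upsampling: replicate each chroma sample horizontally and reuse the same chroma row for both output lines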
+ uchar16 cb = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
+ cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
+ uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
+ cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an IYUV image to NV12
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void IYUV_to_NV12_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Handle 32 pixels per work item: two lines of 16 pixels each
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar8 cb = vload8(0, in_u.ptr);
+ uchar8 cr = vload8(0, in_v.ptr);
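+    // Interleave the planar U and V samples into the single semi-planar UV plane of NV12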
+ uchar16 cbcr = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
+ cr.s6, cb.s7, cr.s7);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a YUYV image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void YUYV422_to_NV12_bt709(
+ IMAGE_DECLARATION(yuyv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Handle 16 pixels per work item: 8 pixels from each of two input lines
+ uchar16 yuyv = vload16(0, in_yuyv.ptr);
+ ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ vstore8(luma, 0, out_y.ptr);
+
+ yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+ ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+ luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
+ vstore8(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a UYVY image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void UYVY422_to_NV12_bt709(
+ IMAGE_DECLARATION(input_uyvy),
+ IMAGE_DECLARATION(luma),
+ IMAGE_DECLARATION(uv))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // Handle 16 pixels per work item: 8 pixels from each of two input lines
+ const uchar16 uyvy_t = vload16(0, in.ptr);
+ vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
+
+ const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
+ vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
+
+ const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
+ const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
+ const uchar8 cbcr = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
+ vstore8(cbcr, 0, out_uv.ptr);
+}
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000000..00f5189508
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ unsigned int offset)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
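+    // Each work item copies 4 F32 elements; 'offset' (in bytes) selects where this input's slice starts
+    // in the destination tensor (typically the accumulated size of the previously concatenated inputs,
+    // an assumption: the exact value is computed on the host side).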
+ float4 source_values = vload4(0, (__global float *)src.ptr);
+
+ vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
new file mode 100644
index 0000000000..3733d0c733
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+ const short left_coeff,
+ const short middle_coeff,
+ const short right_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
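+    // A single 16-byte load provides the three overlapping 8-pixel windows extracted below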
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+ Image *src,
+ const short mat0, const short mat1, const short mat2,
+ const short mat3, const short mat4, const short mat5,
+ const short mat6, const short mat7, const short mat8, uint scale)
+{
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ // Row 0
+ pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+    // Row 1
+ pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+ // Row 2
+ pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+ // Divide by the scale
+ return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
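+ * A minimal build-options sketch (illustrative only; the exact host-side setup lives elsewhere in the library),
+ * here configuring a 3x3 box filter whose coefficients sum to 9:
+ * @code
+ * -DMAT0=1 -DMAT1=1 -DMAT2=1 -DMAT3=1 -DMAT4=1 -DMAT5=1 -DMAT6=1 -DMAT7=1 -DMAT8=1 -DSCALE=9 -DDATA_TYPE=short -DDATA_TYPE_OUT=uchar
+ * @endcode
+ *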
+ * @param[in] src_ptr Pointer to the source image
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution3x3(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
new file mode 100644
index 0000000000..d1335c5558
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+ + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ return out;
+}
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3, mat4 ]\n
+ * [ mat5, mat6, mat7, mat8, mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+ pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+ pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+ pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+ pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single channel temporary image (supported data types: U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1, -DMAT1=2, -DMAT2=3, -DMAT3=4, -DMAT4=5, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
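+    // First pass of the separable 5x5 convolution: horizontal 1x5 filter, kept at full precision
+    // (the division by SCALE happens in the 5x1 vertical pass)
+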
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
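+    // Second pass of the separable 5x5 convolution: vertical 5x1 filter over the intermediate image,
+    // followed by the division by SCALE
+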
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+ // Divide by the scale
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT24, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution5x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution5x5(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
new file mode 100644
index 0000000000..74a0055370
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x7(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution7x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ return out;
+}
+
+/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3, mat4, mat5, mat6 ]\n
+ * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n
+ * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
+ * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
+ * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
+ * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
+ * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution7x7(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
+ pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
+ pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
+ pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
+ pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
+ pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
+ pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single channel temporary image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
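
The static kernels above receive their coefficients through preprocessor macros rather than kernel arguments, so the host has to bake them into the build options. For reference, a minimal host-side sketch using the standard OpenCL C API (not the library's own CLKernelLibrary build path); the helper name, coefficient values and buffer sizes are illustrative only.

/* Minimal host-side sketch: passing the 1x7 coefficients as compile-time
 * macros when building convolution_separable1x7_static. Illustrative only. */
#include <CL/cl.h>
#include <stdio.h>

static cl_program build_separable1x7(cl_context ctx, cl_device_id dev,
                                     const char *source, size_t source_len)
{
    char options[256];
    /* Coefficients of the horizontal pass plus the intermediate data type. */
    snprintf(options, sizeof(options),
             "-DMAT0=1 -DMAT1=4 -DMAT2=6 -DMAT3=10 -DMAT4=6 -DMAT5=4 -DMAT6=1 "
             "-DDATA_TYPE=short");

    cl_int     err  = CL_SUCCESS;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &source, &source_len, &err);
    if(err != CL_SUCCESS)
    {
        return NULL;
    }
    /* The macros become visible to the kernel source at compile time. */
    err = clBuildProgram(prog, 1, &dev, options, NULL, NULL);
    return (err == CL_SUCCESS) ? prog : NULL;
}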
+
+/** Apply a 7x1 static convolution matrix to the single channel intermediate image produced by the 1x7 pass and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT7=7 -DMAT8=8, ... -DMAT13=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable7x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
+
+ // Divide by the scale
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel image including the borders.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT48, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution7x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution7x7(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
+
+    // Saturate the results to the output data type range and store them in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
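
The separable pair (convolution_separable1x7_static followed by convolution_separable7x1_static) reproduces the full 7x7 result exactly when the 7x7 matrix is the outer product of a 7-element column vector and a 7-element row vector. A small scalar C sketch of that identity at a single pixel, with illustrative coefficients and test data:

/* A 7x7 matrix that is the outer product col[i] * row[j] gives the same
 * single-pixel result whether applied in one pass or as a horizontal 1x7
 * pass followed by a vertical 7x1 pass. Names and values are illustrative. */
#include <assert.h>

int main(void)
{
    const int row[7] = { 1, 2, 3, 4, 3, 2, 1 };  /* horizontal coefficients */
    const int col[7] = { 1, 1, 2, 4, 2, 1, 1 };  /* vertical coefficients   */

    int src[7][7];
    for(int y = 0; y < 7; ++y)
        for(int x = 0; x < 7; ++x)
            src[y][x] = (x * 7 + y) % 13;        /* arbitrary test data */

    /* One-pass 7x7 convolution at the centre pixel. */
    int full = 0;
    for(int y = 0; y < 7; ++y)
        for(int x = 0; x < 7; ++x)
            full += src[y][x] * col[y] * row[x];

    /* Two-pass version: 1x7 per row, then 7x1 across the intermediate values. */
    int separable = 0;
    for(int y = 0; y < 7; ++y)
    {
        int horiz = 0;
        for(int x = 0; x < 7; ++x)
            horiz += src[y][x] * row[x];
        separable += horiz * col[y];
    }

    assert(full == separable);
    return 0;
}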
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
new file mode 100644
index 0000000000..d8b07cafac
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the third left pixel
+ * @param[in] left4_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the third right pixel
+ * @param[in] right4_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x9(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short left4_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff,
+ const short right4_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the third up pixel
+ * @param[in] up4_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the third down pixel
+ * @param[in] down4_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution9x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short up4_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff,
+ const short down4_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
+
+ return out;
+}
+
+/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6, mat7, mat8 ]\n
+ * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
+ * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
+ * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
+ * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
+ * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
+ * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n
+ * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n
+ * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] mat49 Coefficient from the convolution matrix
+ * @param[in] mat50 Coefficient from the convolution matrix
+ * @param[in] mat51 Coefficient from the convolution matrix
+ * @param[in] mat52 Coefficient from the convolution matrix
+ * @param[in] mat53 Coefficient from the convolution matrix
+ * @param[in] mat54 Coefficient from the convolution matrix
+ * @param[in] mat55 Coefficient from the convolution matrix
+ * @param[in] mat56 Coefficient from the convolution matrix
+ * @param[in] mat57 Coefficient from the convolution matrix
+ * @param[in] mat58 Coefficient from the convolution matrix
+ * @param[in] mat59 Coefficient from the convolution matrix
+ * @param[in] mat60 Coefficient from the convolution matrix
+ * @param[in] mat61 Coefficient from the convolution matrix
+ * @param[in] mat62 Coefficient from the convolution matrix
+ * @param[in] mat63 Coefficient from the convolution matrix
+ * @param[in] mat64 Coefficient from the convolution matrix
+ * @param[in] mat65 Coefficient from the convolution matrix
+ * @param[in] mat66 Coefficient from the convolution matrix
+ * @param[in] mat67 Coefficient from the convolution matrix
+ * @param[in] mat68 Coefficient from the convolution matrix
+ * @param[in] mat69 Coefficient from the convolution matrix
+ * @param[in] mat70 Coefficient from the convolution matrix
+ * @param[in] mat71 Coefficient from the convolution matrix
+ * @param[in] mat72 Coefficient from the convolution matrix
+ * @param[in] mat73 Coefficient from the convolution matrix
+ * @param[in] mat74 Coefficient from the convolution matrix
+ * @param[in] mat75 Coefficient from the convolution matrix
+ * @param[in] mat76 Coefficient from the convolution matrix
+ * @param[in] mat77 Coefficient from the convolution matrix
+ * @param[in] mat78 Coefficient from the convolution matrix
+ * @param[in] mat79 Coefficient from the convolution matrix
+ * @param[in] mat80 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution9x9(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
+ const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
+ const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
+ const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
+ const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
+ const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
+ const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
+ const short mat80, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
+ pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
+ pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
+ pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
+ pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
+ pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
+ pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
+ pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
+ pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single channel temporary image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 9x1 static convolution matrix to the single channel intermediate image produced by the 1x9 pass and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable9x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
+
+ // Divide by the scale
+ pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution9x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution9x9(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
+ MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
+ MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
+ MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
new file mode 100644
index 0000000000..bd5dfaff68
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel reshapes the tensor's lowest three dimensions into a single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Same as input
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes    The offset of the first element in the bias tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ */
+__kernel void reshape_to_columns(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+#if defined HAS_BIAS
+ VECTOR_DECLARATION(bias),
+#endif
+ uint width, uint height, uint depth, uint total_filters)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+ __global uchar *tmp_src_ptr = src.ptr;
+ __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+ 2) * width * height * dst_stride_y;
+#if defined HAS_BIAS
+ __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif
+
+ if(is_last_thread)
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#if defined HAS_BIAS
+ *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+ tmp_bias_ptr += bias_stride_x;
+#endif
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
+ else
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
+}
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note If biases are to be added to the convolution, -DHAS_BIAS has to be passed at compile time to append a final 1 to each row of the reshaped matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] kernel_size The convolution kernel size
+ * @param[in] kernel_depth The kernel depth
+ * @param[in] width The output tensor width
+ * @param[in] input_dims The input tensor dimensions
+ * @param[in] strides The strides of the im2col operation
+ * @param[in] paddings The input tensor paddings
+ */
+__kernel void im2col_generic(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ int kernel_size,
+ int kernel_depth,
+ int width,
+ int2 input_dims,
+ int2 strides,
+ int2 paddings)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst);
+
+ // Determine output index
+ uint idx = (get_global_id(1) * width + get_global_id(0)) * dst.stride_y;
+ __global uchar *output_ptr = dst.ptr + idx;
+
+ // Determine current input index
+ const int top_left_x = get_global_id(0) * strides.x - paddings.x;
+ const int top_left_y = get_global_id(1) * strides.y - paddings.y;
+
+ // Linearize convolution elements
+ for(int d = 0; d < kernel_depth; ++d)
+ {
+ for(int y = top_left_y, y_e = top_left_y + kernel_size; y < y_e; ++y)
+ {
+ for(int x = top_left_x, x_e = top_left_x + kernel_size; x < x_e; ++x, output_ptr += dst.stride_x)
+ {
+ if(x < 0 || x >= input_dims.x || y < 0 || y >= input_dims.y)
+ {
+ *((__global DATA_TYPE *)output_ptr) = 0;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d)));
+ }
+ }
+ }
+ }
+
+#if defined HAS_BIAS
+ *((__global DATA_TYPE *)output_ptr) = 1;
+#endif
+}
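
In other words, im2col_generic writes one destination row per output location: kernel_depth * kernel_size * kernel_size values taken from the receptive field (zero where the window leaves the input), plus a trailing 1 when HAS_BIAS is defined. A plain C reference sketch of the same layout, assuming an NCHW float input and illustrative parameter names:

/* Reference im2col sketch (illustrative, NCHW float layout assumed).
 * dst has one row per output (x, y) location, matching what
 * im2col_generic writes per work-item. */
static void im2col_reference(const float *src, float *dst,
                             int src_w, int src_h, int kernel_depth,
                             int kernel_size, int out_w, int out_h,
                             int stride_x, int stride_y,
                             int pad_x, int pad_y, int has_bias)
{
    const int row_len = kernel_depth * kernel_size * kernel_size + (has_bias ? 1 : 0);

    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            float    *row        = dst + (oy * out_w + ox) * row_len;
            int       i          = 0;
            const int top_left_x = ox * stride_x - pad_x;
            const int top_left_y = oy * stride_y - pad_y;

            for(int d = 0; d < kernel_depth; ++d)
            {
                for(int y = top_left_y; y < top_left_y + kernel_size; ++y)
                {
                    for(int x = top_left_x; x < top_left_x + kernel_size; ++x)
                    {
                        const int inside = (x >= 0 && x < src_w && y >= 0 && y < src_h);
                        row[i++] = inside ? src[(d * src_h + y) * src_w + x] : 0.0f;
                    }
                }
            }
            if(has_bias)
            {
                row[i] = 1.0f; /* appended so the GEMM picks up the bias term */
            }
        }
    }
}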
+
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The output tensor width
+ */
+__kernel void col2im(
+ IMAGE_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+
+ int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / width) * dst.stride_y + (get_global_id(1) % width) * dst.stride_x;
+ __global uchar *tmp_out_ptr = dst.ptr + idx;
+ *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(src.ptr));
+}
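
col2im performs the reverse mapping: each element of the GEMM output, indexed by (spatial position, filter), is scattered back into a 3D tensor whose Z dimension is the filter index. A brief C sketch of the same index arithmetic, assuming a row-major GEMM output with one row per spatial location and one column per filter; names are illustrative:

/* Reverse mapping of col2im: the GEMM output element at (spatial position,
 * filter) is written to dst[filter][y][x]. Layouts and names are illustrative. */
static void col2im_reference(const float *gemm_out, float *dst,
                             int num_filters, int out_w, int out_h)
{
    for(int pos = 0; pos < out_w * out_h; ++pos)
    {
        const int x = pos % out_w;
        const int y = pos / out_w;
        for(int f = 0; f < num_filters; ++f)
        {
            /* dst laid out as [channel][row][column] */
            dst[(f * out_h + y) * out_w + x] = gemm_out[pos * num_filters + f];
        }
    }
}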
+
+/** This kernel reshapes the tensor's lowest three dimensions into a single row for the GEMM operation
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note If biases are to be added at a later stage, -DHAS_BIAS has to be passed at compile time to append a final 1 to the output row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input.
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+__kernel void im2col_reduced(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst),
+ uint width, uint height)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const uint image_size = width * height;
+
+ __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
+
+ *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+
+#if defined HAS_BIAS
+ // If it is the last thread in the 3 dimensional workgroup
+ if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+ {
+ tmp_out_ptr += dst_stride_x;
+ *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1;
+ }
+#endif
+}
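
im2col_reduced therefore flattens the whole 3D input into a single row using the index x + y * width + z * width * height, appending a 1 when HAS_BIAS is defined. A one-function C sketch of that flattening; the input is addressed through byte strides to mirror the padded tensors the kernel handles, and the names are illustrative:

/* Flattening performed by im2col_reduced: element (x, y, z) of the input
 * lands at position x + y * width + z * width * height of the output row,
 * with a trailing 1 appended when has_bias is set. Illustrative only. */
#include <stddef.h>
#include <stdint.h>

static void im2col_reduced_reference(const uint8_t *src_base, size_t stride_y, size_t stride_z,
                                     float *dst, int width, int height, int depth, int has_bias)
{
    for(int z = 0; z < depth; ++z)
    {
        for(int y = 0; y < height; ++y)
        {
            const float *row = (const float *)(src_base + z * stride_z + y * stride_y);
            for(int x = 0; x < width; ++x)
                dst[x + y * width + z * width * height] = row[x];
        }
    }

    if(has_bias)
    {
        dst[width * height * depth] = 1.0f; /* appended so the bias folds into the GEMM */
    }
}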
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
new file mode 100644
index 0000000000..96b9cff3eb
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "convolution3x3.cl"
+#include "convolution5x5.cl"
+#include "convolution7x7.cl"
+#include "convolution9x9.cl"
+#include "helpers.h"
+
+#define MAT_INDEX(i) MAT##i
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_rectangle(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short matrix_coeff[81] =
+ {
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
+ MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
+ MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
+ MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
+ MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
+ MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
+ MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
+ MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
+ MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
+ };
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
+
+ for(int i = 0; i < MATRIX_HEIGHT; i++)
+ {
+#if MATRIX_WIDTH == 3
+ pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
+ matrix_coeff[2 + i * 3]);
+#endif
+
+#if MATRIX_WIDTH == 5
+ pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
+ matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
+#endif
+
+#if MATRIX_WIDTH == 7
+ pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
+ matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
+ matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
+#endif
+
+#if MATRIX_WIDTH == 9
+ pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
+ matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
+ matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
+#endif
+ }
+
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
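
Because matrix_coeff is declared with 81 entries, all MAT0..MAT80 macros must be defined at compile time even for smaller rectangles, with the unused positions set to 0. A host-side C sketch of assembling such a build-options string for a MATRIX_WIDTH x MATRIX_HEIGHT rectangle; the helper name and default type choices are illustrative only:

/* Sketch: assembling the build options for convolution_rectangle. All 81
 * MATi macros are emitted (unused positions as 0) because the kernel
 * declares matrix_coeff[81]. Illustrative only. */
#include <stdio.h>

static void rectangle_build_options(const short *coeffs, int matrix_width,
                                    int matrix_height, int scale,
                                    char *options, size_t options_len)
{
    size_t used = (size_t)snprintf(options, options_len,
                                   "-DMATRIX_WIDTH=%d -DMATRIX_HEIGHT=%d -DSCALE=%d "
                                   "-DCOMPUTE_TYPE=int -DDATA_TYPE=int -DDATA_TYPE_OUT=uchar",
                                   matrix_width, matrix_height, scale);

    for(int i = 0; i < 81 && used < options_len; ++i)
    {
        /* Coefficients outside the rectangle are forced to 0. */
        const int value = (i < matrix_width * matrix_height) ? coeffs[i] : 0;
        used += (size_t)snprintf(options + used, options_len - used, " -DMAT%d=%d", i, value);
    }
}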
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
new file mode 100644
index 0000000000..c8eaa95352
--- /dev/null
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
+#else
+#define CONVERT_DOWN(x, type) CONVERT(x, type)
+#endif
+
+/** This function performs a down-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_down(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int shift)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, 16)
+ in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+ vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
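
The down-conversion shifts right before narrowing, and the narrowing either wraps with plain CONVERT or clamps when SATURATE is defined. A scalar C sketch of both behaviours for a U16 to U8 conversion (values illustrative):

/* Scalar sketch of convert_depth_down for a U16 -> U8 conversion:
 * shift right first, then either truncate (wrap) or saturate. */
#include <stdint.h>

static uint8_t depth_down_wrap(uint16_t v, int shift)
{
    return (uint8_t)(v >> shift);                    /* plain CONVERT: wraps modulo 256 */
}

static uint8_t depth_down_saturate(uint16_t v, int shift)
{
    const uint16_t shifted = (uint16_t)(v >> shift);
    return (uint8_t)(shifted > 255 ? 255 : shifted); /* CONVERT_SAT: clamps to U8 range */
}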
+
+/** This function performs an up-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_up(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int shift)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_data = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ vstore16(in_data << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
new file mode 100644
index 0000000000..0e810d2e7c
--- /dev/null
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes the first-order derivative.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void derivative(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+#ifdef GRAD_X
+ short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
+ short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
+ vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
+ short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
+ vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
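A scalar C sketch of the same computation, not part of the patch: one central difference along X and/or Y, widened to signed 16-bit, exactly as the vectorised kernel does for 16 pixels at a time. It assumes (x, y) lies inside the image border.

#include <stdint.h>

static void derivative_scalar(const uint8_t *src, int stride, int x, int y,
                              int16_t *gx, int16_t *gy)
{
    const uint8_t *p = src + y * stride + x;
    *gx = (int16_t)p[1]      - (int16_t)p[-1];      /* GRAD_X path: right - left */
    *gy = (int16_t)p[stride] - (int16_t)p[-stride]; /* GRAD_Y path: bottom - top */
}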
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
new file mode 100644
index 0000000000..c62c701757
--- /dev/null
+++ b/src/core/CL/cl_kernels/dilate.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function dilates an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void dilate(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
+
+ vstore8(out, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
new file mode 100644
index 0000000000..6576f1827f
--- /dev/null
+++ b/src/core/CL/cl_kernels/erode.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function erodes an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void erode(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
+
+ vstore8(out, 0, dst.ptr);
+}
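The dilate and erode kernels above both take the extreme value of each pixel's 3x3 neighbourhood, vectorised over 8 output pixels. A minimal scalar C sketch of the same operation, not part of the patch (border handling is assumed to be done elsewhere):

#include <stdint.h>

/* 3x3 dilation around (x, y); erosion is identical with min instead of max. */
static uint8_t dilate3x3_scalar(const uint8_t *src, int stride, int x, int y)
{
    uint8_t out = 0;
    for(int dy = -1; dy <= 1; ++dy)
    {
        for(int dx = -1; dx <= 1; ++dx)
        {
            uint8_t v = src[(y + dy) * stride + (x + dx)];
            if(v > out)
            {
                out = v;
            }
        }
    }
    return out;
}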
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
new file mode 100644
index 0000000000..470d14a7b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
+ *
+ * . . F 0 1 . . .
+ * . E . . . 2 . .
+ * D . . . . . 3 .
+ * C . . P . . 4 .
+ * B . . . . . 5 .
+ * . A . . . 6 . .
+ * . . 9 8 7 . . .
+ */
+constant int offsets_s[16][2] =
+{
+ { 0, -3 }, // 0
+ { 1, -3 }, // 1
+ { 2, -2 }, // 2
+ { 3, -1 }, // 3
+ { 3, 0 }, // 4
+ { 3, 1 }, // 5
+ { 2, 2 }, // 6
+ { 1, 3 }, // 7
+ { 0, 3 }, // 8
+ { -1, 3 }, // 9
+ { -2, 2 }, // A
+ { -3, 1 }, // B
+ { -3, 0 }, // C
+ { -3, -1 }, // D
+ { -2, -2 }, // E
+ { -1, -3 }, // F
+};
+
+/** Load a pixel and set the mask values.
+ *
+ * @param[in] ptr The pointer to the starting address of source image
+ * @param[in] a Index to indicate the position in the Bresenham circle
+ * @param[in] stride Stride of source image in x dimension
+ * @param[in] dark The left end of the threshold range
+ * @param[in] bright The right end of the threshold range
+ * @param[out] dark_mask The bit mask recording dark pixels. The bit at position a is set to 1 if the corresponding pixel is dark
+ * @param[out] bright_mask The bit mask recording bright pixels. The bit at position a is set to 1 if the corresponding pixel is bright
+ *
+ */
+#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
+ { \
+ unsigned char pixel; \
+ pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \
+ dark_mask |= (pixel < dark) << a; \
+ bright_mask |= (pixel > bright) << a; \
+ }
+
+/** Checks if a pixel is a corner. A pixel is considered a corner if 9 contiguous pixels in the Bresenham circle are all brighter or all darker than the candidate.
+ *
+ * @param[in] bright_mask The mask recording positions of bright pixels
+ * @param[in] dark_mask The mask recording positions of dark pixels
+ * @param[out] isCorner Indicates whether the candidate pixel is a corner
+ */
+#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \
+ { \
+ for(int i = 0; i < 16; i++) \
+ { \
+ isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
+ isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \
+ if(isCorner) \
+ { \
+ break; \
+ } \
+ bright_mask >>= 1; \
+ dark_mask >>= 1; \
+ } \
+ }
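To see why CHECK_CORNER duplicates the 16-bit mask into the high half before shifting: a run of 9 set bits may wrap past circle position 15 back to position 0, and the duplication turns that wrap-around into an ordinary contiguous run. A small illustrative C sketch, not part of the patch:

#include <stdbool.h>
#include <stdint.h>

static bool has_9_contiguous(uint16_t mask16)
{
    /* Duplicate the low 16 bits so a run that wraps past bit 15 stays contiguous */
    uint32_t m = (uint32_t)mask16 | ((uint32_t)mask16 << 16);
    for(int i = 0; i < 16; ++i, m >>= 1)
    {
        if((m & 0x1FF) == 0x1FF) /* 9 consecutive set bits starting at position i */
        {
            return true;
        }
    }
    return false;
}
/* e.g. has_9_contiguous(0xC07F) is true: positions 14, 15, 0..6 form a run of 9. */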
+
+/* Calculate the pixel's corner strength: binary search for the highest threshold at which the pixel is still detected as a corner */
+uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
+{
+ short a = threshold;
+ short b = 255;
+ while(b - a > 1)
+ {
+ uchar c = convert_uchar_sat((a + b) / 2);
+ unsigned int bright_mask = 0;
+ unsigned int dark_mask = 0;
+
+ unsigned char p_bright = add_sat(candidate_pixel, c);
+ unsigned char p_dark = sub_sat(candidate_pixel, c);
+
+ bool isCorner = 0;
+
+ for(uint i = 0; i < 16; i++)
+ {
+ LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
+ }
+
+ bright_mask |= (bright_mask << 16);
+ dark_mask |= (dark_mask << 16);
+ CHECK_CORNER(bright_mask, dark_mask, isCorner);
+
+ if(isCorner)
+ {
+ a = convert_short(c);
+ }
+ else
+ {
+ b = convert_short(c);
+ }
+ }
+ return a;
+}
+
+/** Fast corners implementation. Calculates and returns the strength of each pixel.
+ *
+ * The algorithm loops through the 16 pixels in the Bresenham circle and sets a bit in the low 16 bits of the masks when the corresponding pixel is bright
+ * or dark. It then copies the low 16 bits into the high 16 bits of the masks and right-shifts the masks to check whether 9 contiguous bits
+ * starting from the LSB are set.
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] threshold_value Threshold value.
+ *
+ */
+__kernel void fast_corners(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output),
+ float threshold_value)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ const unsigned char threshold = (uchar)threshold_value;
+
+ unsigned int bright_mask = 0;
+ unsigned int dark_mask = 0;
+
+ unsigned char isCorner = 0;
+
+ unsigned char p = *in.ptr;
+ unsigned char p_bright = add_sat(p, threshold);
+ unsigned char p_dark = sub_sat(p, threshold);
+
+ LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+ if(((bright_mask | dark_mask) & 0x1111) == 0)
+ {
+ *out.ptr = 0;
+ return;
+ }
+
+ LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+ bright_mask |= (bright_mask << 16);
+ dark_mask |= (dark_mask << 16);
+
+ CHECK_CORNER(bright_mask, dark_mask, isCorner)
+
+ if(!isCorner)
+ {
+ *out.ptr = 0;
+ return;
+ }
+
+#ifndef USE_MAXSUPPRESSION
+ *out.ptr = 1;
+#else
+
+ *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
+#endif
+}
+
+/** Copy result to Keypoint buffer and count number of corners
+ *
+ * @param[in] input_ptr Pointer to the image with calculated strengths. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] max_num_points The maximum number of keypoints the array can hold
+ * @param[in] offset The border offset (number of skipped pixels) added to the keypoint coordinates
+ * @param[out] num_of_points Number of points found
+ * @param[out] out The keypoints found
+ *
+ */
+__kernel void copy_to_keypoint(
+ IMAGE_DECLARATION(input),
+ uint max_num_points,
+ uint offset,
+ __global uint *num_of_points,
+ __global Keypoint *out)
+{
+#ifndef UPDATE_NUMBER
+ if(*num_of_points >= max_num_points)
+ {
+ return;
+ }
+#endif
+
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uchar value = *in.ptr;
+
+ if(value > 0)
+ {
+ int id = atomic_inc(num_of_points);
+ if(id < max_num_points)
+ {
+ out[id].strength = value;
+ out[id].x = get_global_id(0) + offset;
+ out[id].y = get_global_id(1) + offset;
+ out[id].tracking_status = 1;
+ }
+ }
+}
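The kernel above compacts the sparse strength image into a dense keypoint array: each work-item that sees a non-zero strength claims a slot with atomic_inc. A single-threaded C sketch of the equivalent logic, not part of the patch; the KeypointSketch struct and the collect_keypoints name are simplified stand-ins for the library's real Keypoint type.

#include <stdint.h>

typedef struct
{
    uint16_t x, y;
    uint8_t  strength;
    uint8_t  tracking_status;
} KeypointSketch; /* simplified stand-in for the real Keypoint */

static uint32_t collect_keypoints(const uint8_t *strength, int width, int height, int stride,
                                  uint32_t border, KeypointSketch *out, uint32_t max_num_points)
{
    uint32_t n = 0; /* on the device this counter is bumped with atomic_inc */
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            uint8_t v = strength[y * stride + x];
            if(v > 0 && n < max_num_points)
            {
                out[n] = (KeypointSketch){ (uint16_t)(x + border), (uint16_t)(y + border), v, 1 };
                ++n;
            }
        }
    }
    return n;
}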
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
new file mode 100644
index 0000000000..df635869b1
--- /dev/null
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+ IMAGE_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos)
+{
+ Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+ }
+ // Handle right border
+ DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+ }
+ }
+ else
+ {
+ // Get value for corners
+ int val_idx = gidW;
+ if(gidW < 0 || gidW > (width - 1))
+ {
+ val_idx = gidW < 0 ? 0 : width - 1;
+ }
+
+ // Handle top border
+ DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+ }
+ // Handle bottom border
+ DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+ }
+ }
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+ IMAGE_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos,
+ DATA_TYPE constant_value)
+{
+ Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+ }
+ // Handle right border
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+ }
+ }
+ else
+ {
+ // Handle top border
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+ }
+ // Handle bottom border
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+ }
+ }
+}
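Both fill_image_borders_* kernels above share one work-item mapping along dimension 0: a global id either owns one row (filling its left/right borders) or one column (filling its top/bottom borders, corners included). A minimal C sketch of that mapping, not part of the patch; the function and parameter names are illustrative.

static void map_border_workitem(int gid0, int width, int border_left, int border_right,
                                int *gidH /* >= 0: row whose left/right borders to fill */,
                                int *gidW /* otherwise: column whose top/bottom borders to fill */)
{
    const int total_width = border_left + width + border_right;
    *gidH = gid0 - total_width;
    *gidW = gid0 - border_left;
}
/* e.g. width = 8, height = 6, left/right borders = 2: total_width = 12, so
 * gid0 = 0..11 gives gidW = -2..9 (top/bottom, corners included) and
 * gid0 = 12..17 gives gidH = 0..5 (left/right borders of each row). */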
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
new file mode 100644
index 0000000000..618937f36d
--- /dev/null
+++ b/src/core/CL/cl_kernels/gaussian_pyramid.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian1x5_sub_x(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values for the convolution (20 bytes needed)
+ uchar16 temp0 = vload16(0, src.ptr);
+ uchar4 temp1 = vload4(0, src.ptr + 16);
+
+ // Convert to USHORT8
+ ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
+ ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
+ ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
+ ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
+ ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
+
+ // Compute convolution along the X direction
+ ushort8 pixels = l2_data + r2_data;
+ pixels += l1_data * (ushort8)4;
+ pixels += m_data * (ushort8)6;
+ pixels += r1_data * (ushort8)4;
+
+ // Store result
+ vstore8(pixels, 0, (__global ushort *)dst.ptr);
+}
+
+/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian5x1_sub_y(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
+ ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
+ ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2));
+ ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
+ ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
+
+ // Compute convolution along the Y direction
+ ushort8 pixels = u2_data + d2_data;
+ pixels += u1_data * (ushort8)4;
+ pixels += m_data * (ushort8)6;
+ pixels += d1_data * (ushort8)4;
+
+ // Scale result
+ pixels >>= (ushort8)8;
+
+ // Store result
+ vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
+}
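Taken together, the two passes above implement a separable 5x5 binomial Gaussian with 2x decimation in each direction and a final normalisation by 256 (the filter sum squared). A scalar C sketch of one output pixel, not part of the patch: boundary handling and the exact sampling origin of the real kernels are simplified here.

#include <stdint.h>

static uint8_t gaussian_pyramid_pixel(const uint8_t *src, int stride, int dst_x, int dst_y)
{
    static const int w[5] = { 1, 4, 6, 4, 1 }; /* binomial 1-4-6-4-1 taps */
    int sum = 0;
    for(int ky = 0; ky < 5; ++ky)
    {
        for(int kx = 0; kx < 5; ++kx)
        {
            sum += w[ky] * w[kx] * src[(2 * dst_y + ky) * stride + (2 * dst_x + kx)];
        }
    }
    return (uint8_t)(sum >> 8); /* (1+4+6+4+1)^2 = 256 */
}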
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 0000000000..caf6e3ffd8
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,1099 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes the "vector" 1x4 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ float4 b0 = vload4(0, (__global float *)src.ptr);
+
+ vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel computes the "vector" 1x8 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ half8 b0 = vload8(0, (__global half *)src.ptr);
+
+ vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel computes the "vector" 1x16 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
+
+ vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
+}
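All three gemm_transpose1x* kernels above copy one 16-byte block per work-item (float4, half8 or uchar16 depending on the variant) and swap its block coordinates, so block (x, y) of the source lands at block (y, x) of the destination. A small C sketch of the destination address computation, not part of the patch; x and y stand for the work-item's global ids, i.e. the block coordinates.

#include <stddef.h>

static size_t transposed_block_offset(size_t x, size_t y, size_t dst_stride_y,
                                      size_t dst_offset_first_element)
{
    /* 16 bytes per block; x and y are swapped relative to the source layout */
    return y * 16 + x * dst_stride_y + dst_offset_first_element;
}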
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
+ float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
+ float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
+ float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
+
+ float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
+
+ val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
+
+ val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
+
+ val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
+}
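The interleave kernels above transpose each 4x4 tile of the source and lay it out as one contiguous run, so the four values that a single matrix-multiply step needs (one per row of the tile) end up adjacent in memory. A scalar C sketch for the 32-bit variant, not part of the patch:

static void interleave4x4_scalar(const float src[4][4], float dst[16])
{
    for(int col = 0; col < 4; ++col)      /* column of the source tile */
    {
        for(int row = 0; row < 4; ++row)  /* row of the source tile    */
        {
            dst[col * 4 + row] = src[row][col];
        }
    }
}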
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
+ half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
+ half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
+ half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
+
+ half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
+
+ val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
+
+ val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
+
+ val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
+}
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
+ uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
+ uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
+ uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
+
+ uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
+ (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
+
+ val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
+ (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
+
+ val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
+ (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
+
+ val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
+ (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same data type as the accumulate tensor.
+ * @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemm_accumulate_biases_f32(
+ IMAGE_DECLARATION(accum),
+ VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ float4 accum_value = vload4(0, (__global float *)accum.ptr);
+ float4 biases_value = vload4(0, (__global float *)biases.ptr);
+ accum_value = biases_value + accum_value;
+
+ // Store result in the accumulate buffer
+ vstore4(accum_value, 0, (__global float *)accum.ptr);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same data type as the accumulate tensor.
+ * @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemm_accumulate_biases_f16(
+ IMAGE_DECLARATION(accum),
+ VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ half8 accum_value = vload8(0, (__global half *)accum.ptr);
+ half8 biases_value = vload8(0, (__global half *)biases.ptr);
+ accum_value = biases_value + accum_value;
+
+ // Store result in the accumulate buffer
+ vstore8(accum_value, 0, (__global half *)accum.ptr);
+}
+
+#if(defined WIDTH_MATRIX_B)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
+ *
+ * @attention The width of matrix B needs to be passed at compile time using -DWIDTH_MATRIX_B
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] a_offset Offset to be added to each element of the matrix A
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] c_offset Offset to be added to each element of the matrix C.
+ * @param[in] c_mult_int Multiplied with each element of the matrix C.
+ * @param[in] shift Number of bits to shift right the result.
+ */
+__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ int a_offset,
+ int b_offset,
+ int c_offset,
+ int c_mult_int,
+ int shift)
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+ int16 c00 = 0;
+ int16 c10 = 0;
+ int16 c20 = 0;
+ int16 c30 = 0;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+
+ int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+
+ c00 += (int16)a0.s4 * b1;
+ c10 += (int16)a0.s5 * b1;
+ c20 += (int16)a0.s6 * b1;
+ c30 += (int16)a0.s7 * b1;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
+ c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
+ c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
+ c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
+
+ /* Store 4x16 block */
+ vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
+ vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
+ vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
+ vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
+}
+#endif
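For one output element, gemm_mm_u8 above adds the per-operand offsets to the 8-bit values, accumulates the products in 32-bit, then re-quantises with an offset, a multiplier and a right shift before saturating back to U8. A scalar C sketch, not part of the patch; it assumes an arithmetic right shift, as OpenCL guarantees.

#include <stdint.h>

static uint8_t gemm_u8_element(const uint8_t *a_row, const uint8_t *b_col, int k,
                               int a_offset, int b_offset,
                               int c_offset, int c_mult_int, int shift)
{
    int32_t acc = 0;
    for(int i = 0; i < k; ++i)
    {
        acc += (a_row[i] + a_offset) * (b_col[i] + b_offset);
    }

    int32_t c = ((c_offset + acc) * c_mult_int) >> shift;
    if(c < 0)   c = 0;   /* saturate to the U8 range, as convert_uchar16_sat does */
    if(c > 255) c = 255;
    return (uint8_t)c;
}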
+
+#if(defined WIDTH_MATRIX_B && defined ALPHA)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Divide by 4 in order to get the src_addr in unit of float */
+ src_addr = src_addr >> 2;
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+ float4 c00 = 0.0f;
+ float4 c10 = 0.0f;
+ float4 c20 = 0.0f;
+ float4 c30 = 0.0f;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+ float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
+ b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+ float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * (float4)ALPHA;
+ c10 = c10 * (float4)ALPHA;
+ c20 = c20 * (float4)ALPHA;
+ c30 = c30 * (float4)ALPHA;
+
+ /* Store 4x4 block */
+ vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
+}
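+
+/* Illustration (reference sketch only, not called by the library): the 4x4 block of
+ * alpha * A * B that one work-item of the kernel above produces, written as plain
+ * scalar code. Here a walks the interleaved rows of A (4 values per step, one per
+ * output row), b walks the transposed columns of B (4 values per step, one per
+ * output column) and k_steps is the number of outer-product accumulation steps. */
+inline void gemm_mm_4x4_reference_sketch(const __global float *a, const __global float *b,
+                                         int k_steps, float alpha, float c[4][4])
+{
+    for(int i = 0; i < 4; ++i)
+    {
+        for(int j = 0; j < 4; ++j)
+        {
+            c[i][j] = 0.0f;
+        }
+    }
+    for(int k = 0; k < k_steps; ++k, a += 4, b += 4)
+    {
+        for(int i = 0; i < 4; ++i)
+        {
+            for(int j = 0; j < 4; ++j)
+            {
+                /* Same product the vectorised c00..c30 updates accumulate above */
+                c[i][j] += a[i] * b[j];
+            }
+        }
+    }
+    for(int i = 0; i < 4; ++i)
+    {
+        for(int j = 0; j < 4; ++j)
+        {
+            /* Weight of the matrix product (-DALPHA) */
+            c[i][j] *= alpha;
+        }
+    }
+}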
+
+/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1).
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication.
+ *
+ * @attention The width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+
+ // Reset accumulators
+ float c00 = 0.0f;
+ float c01 = 0.0f;
+ float c02 = 0.0f;
+ float c03 = 0.0f;
+ float c10 = 0.0f;
+ float c11 = 0.0f;
+ float c12 = 0.0f;
+ float c13 = 0.0f;
+ float c20 = 0.0f;
+ float c21 = 0.0f;
+ float c22 = 0.0f;
+ float c23 = 0.0f;
+ float c30 = 0.0f;
+ float c31 = 0.0f;
+ float c32 = 0.0f;
+ float c33 = 0.0f;
+
+ for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4);
+ b0 = vload4(0, src_addr_b + 4);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 8);
+ b0 = vload4(0, src_addr_b + 8);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 12);
+ b0 = vload4(0, src_addr_b + 12);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product
+ c00 = c00 * ALPHA;
+ c01 = c01 * ALPHA;
+ c02 = c02 * ALPHA;
+ c03 = c03 * ALPHA;
+ c10 = c10 * ALPHA;
+ c11 = c11 * ALPHA;
+ c12 = c12 * ALPHA;
+ c13 = c13 * ALPHA;
+ c20 = c20 * ALPHA;
+ c21 = c21 * ALPHA;
+ c22 = c22 * ALPHA;
+ c23 = c23 * ALPHA;
+ c30 = c30 * ALPHA;
+ c31 = c31 * ALPHA;
+ c32 = c32 * ALPHA;
+ c33 = c33 * ALPHA;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ // Store 4x4 block
+ vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+}
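+
+/* Illustration (sketch only): the Bifrost variant above keeps 16 scalar accumulators and
+ * updates each one with a fused multiply-add instead of a vector multiply-accumulate.
+ * Per element the update is functionally: */
+inline float gemm_fma_accumulate_sketch(float a, float b, float c)
+{
+    /* fma(a, b, c) returns a * b + c rounded once, which is what every c00..c33 update does */
+    return fma(a, b, c);
+}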
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication.
+ *
+ * @attention The width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Divide by 2 in order to get the src_addr in units of half */
+ src_addr = src_addr >> 1;
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+ half8 c00 = 0.0f;
+ half8 c10 = 0.0f;
+ half8 c20 = 0.0f;
+ half8 c30 = 0.0f;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
+ b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * (half8)ALPHA;
+ c10 = c10 * (half8)ALPHA;
+ c20 = c20 * (half8)ALPHA;
+ c30 = c30 * (half8)ALPHA;
+
+ /* Store 4x8 block */
+ vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
+ vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
+ vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
+ vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
+}
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector-by-matrix multiplication between vector A (src0) and matrix B (src1).
+ *
+ * @attention The width of vector A, the width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_VECTOR_A, -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped.
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (float4)ALPHA;
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
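+
+/* Illustration (reference sketch only, not called by the library): the 4 consecutive
+ * outputs one work-item of gemm_vm_f32 produces. a points to the start of row vector A,
+ * b points to the 4 destination columns in the first row of the non-reshaped matrix B,
+ * row_stride_floats is src1_stride_y expressed in floats, and width_vector_a corresponds
+ * to -DWIDTH_VECTOR_A. */
+inline float4 gemm_vm_f32_reference_sketch(const __global float *a, const __global float *b,
+                                           int width_vector_a, int row_stride_floats, float alpha)
+{
+    float4 acc = (float4)0.0f;
+    for(int k = 0; k < width_vector_a; ++k)
+    {
+        /* Row k of B contributes a[k] times 4 of its columns to the 4 outputs */
+        acc += vload4(0, b + k * row_stride_floats) * (float4)a[k];
+    }
+    /* Weight of the vector-matrix product (-DALPHA) */
+    return acc * (float4)alpha;
+}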
+
+/** This OpenCL kernel computes the vector-by-matrix multiplication between vector A (src0) and matrix B (src1).
+ *
+ * @attention The width of vector A, the width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_VECTOR_A, -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped.
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 8;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(half);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
+
+ half8 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
+ {
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+ half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
+ half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+
+ acc += b0 * (half8)a0.s0;
+ acc += b1 * (half8)a0.s1;
+ acc += b2 * (half8)a0.s2;
+ acc += b3 * (half8)a0.s3;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
+ {
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (half8)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (half8)ALPHA;
+
+ vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
+#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+
+#if(defined BETA)
+/** This OpenCL kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @attention The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ /* Load values from Matrix C */
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ /* Store final result in axb matrix */
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
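+
+/* Illustration (sketch only): gemm_ma_f32 finalises C <- alpha * A * B + beta * C, where
+ * dst already holds alpha * A * B and src holds the original C. Per element it computes: */
+inline float gemm_ma_reference_sketch(float alpha_ab, float beta, float c)
+{
+    /* beta corresponds to the compile-time constant -DBETA */
+    return alpha_ab + beta * c;
+}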
+
+/** This OpenCL kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @attention The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ /* Load values from Matrix C */
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ /* Store final result in axb matrix */
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif /* (defined BETA) */
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector-by-matrix multiplication between each row of A (src0) and matrix B (src1), as used by the locally connected layer.
+ *
+ * @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @attention The input A and matrix B must not be reshaped.
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+ TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
new file mode 100644
index 0000000000..5320a064ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/harris_corners.cl
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Function running the Harris score on a 3x3 block size
+ *
+ * @attention The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
+ */
+__kernel void harris_score_3x3(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float4 gx2 = (float4)0.0f;
+ float4 gy2 = (float4)0.0f;
+ float4 gxgy = (float4)0.0f;
+
+ /* Row0 */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
+
+ float4 l_gx = convert_float4(temp_gx.s0123);
+ float4 m_gx = convert_float4(temp_gx.s1234);
+ float4 r_gx = convert_float4(temp_gx.s2345);
+
+ float4 l_gy = convert_float4(temp_gy.s0123);
+ float4 m_gy = convert_float4(temp_gy.s1234);
+ float4 r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row1 */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row2 */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
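+
+/* Illustration (scalar sketch only, not called by the kernels): the Harris-Stephens
+ * response computed above from the summed gradient products of a block, where
+ * gx2 = sum(Gx*Gx), gy2 = sum(Gy*Gy) and gxgy = sum(Gx*Gy) over the block. */
+inline float harris_response_sketch(float gx2, float gy2, float gxgy,
+                                    float sensitivity, float strength_thresh, float norm_factor)
+{
+    float trace = gx2 + gy2;               /* trace of the 2x2 structure tensor   */
+    float det   = gx2 * gy2 - gxgy * gxgy; /* determinant of the structure tensor */
+    float mc    = (det - sensitivity * trace * trace) * norm_factor;
+    return (mc > strength_thresh) ? mc : 0.0f; /* discard weak responses */
+}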
+
+/** Function for calculating the Harris score partial sums (Gx^2, Gy^2 and Gx*Gy) over a 1x5 row.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ */
+inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
+
+ float4 gx = convert_float4(temp_gx.s0123);
+ float4 gy = convert_float4(temp_gy.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s1234);
+ gy = convert_float4(temp_gy.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s2345);
+ gy = convert_float4(temp_gy.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s3456);
+ gy = convert_float4(temp_gy.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s4567);
+ gy = convert_float4(temp_gy.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running the Harris score on a 5x5 block size
+ *
+ * @attention The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
+ */
+__kernel void harris_score_5x5(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float16 res = (float16)0.0f;
+
+ /* Compute row */
+ for(int i = -2; i < 3; i++)
+ {
+ res += harris_score_1x5(&src_gx, &src_gy, i);
+ }
+
+ float4 gx2 = res.s0123;
+ float4 gy2 = res.s4567;
+ float4 gxgy = res.s89AB;
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating the Harris score partial sums (Gx^2, Gy^2 and Gx*Gy) over a 1x7 row.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ */
+inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
+
+ float4 gx = convert_float4(temp_gx0.s0123);
+ float4 gy = convert_float4(temp_gy0.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s1234);
+ gy = convert_float4(temp_gy0.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s2345);
+ gy = convert_float4(temp_gy0.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s3456);
+ gy = convert_float4(temp_gy0.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s4567);
+ gy = convert_float4(temp_gy0.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running the Harris score on a 7x7 block size
+ *
+ * @attention The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
+ */
+__kernel void harris_score_7x7(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float16 res = (float16)0.0f;
+
+ /* Compute row */
+ for(int i = -3; i < 4; i++)
+ {
+ res += harris_score_1x7(&src_gx, &src_gy, i);
+ }
+
+ float4 gx2 = res.s0123;
+ float4 gy2 = res.s4567;
+ float4 gxgy = res.s89AB;
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 0000000000..6db8ed567c
--- /dev/null
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+ Vector vector =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y
+ };
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into a Tensor3D structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Tensor3D tensor =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z
+ };
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the Vector structure
+ * @param[in] x Relative X position
+ */
+__global inline const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the Image structure
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+__global inline uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the Tensor3D structure
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
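+
+/* Illustration (usage sketch only, not part of the library): how a kernel typically
+ * combines these helpers. CONVERT_TO_IMAGE_STRUCT(name) builds an Image whose ptr already
+ * points at this work-item's element, and offset(&img, x, y) then addresses neighbouring
+ * elements in bytes. */
+inline void copy_pixel_u8_sketch(Image *src, Image *dst)
+{
+    /* Read the element this work-item owns from the source and write it to the destination */
+    *offset(dst, 0, 0) = *offset(src, 0, 0);
+}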
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
new file mode 100644
index 0000000000..a652b28e6a
--- /dev/null
+++ b/src/core/CL/cl_kernels/histogram.cl
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define VATOMIC_INC16(histogram, win_pos) \
+ { \
+ atomic_inc(histogram + win_pos.s0); \
+ atomic_inc(histogram + win_pos.s1); \
+ atomic_inc(histogram + win_pos.s2); \
+ atomic_inc(histogram + win_pos.s3); \
+ atomic_inc(histogram + win_pos.s4); \
+ atomic_inc(histogram + win_pos.s5); \
+ atomic_inc(histogram + win_pos.s6); \
+ atomic_inc(histogram + win_pos.s7); \
+ atomic_inc(histogram + win_pos.s8); \
+ atomic_inc(histogram + win_pos.s9); \
+ atomic_inc(histogram + win_pos.sa); \
+ atomic_inc(histogram + win_pos.sb); \
+ atomic_inc(histogram + win_pos.sc); \
+ atomic_inc(histogram + win_pos.sd); \
+ atomic_inc(histogram + win_pos.se); \
+ atomic_inc(histogram + win_pos.sf); \
+ }
+
+/** Calculate the histogram of an 8 bit grayscale image.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length num_bins
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] histogram_local The local buffer used to hold the per-work-group histogram. Supported data types: U32
+ * @param[out] histogram The output buffer to hold the final histogram result. Supported data types: U32
+ * @param[in] num_bins The number of bins
+ * @param[in] offset The start of values to use (inclusive)
+ * @param[in] range The total range of values covered by the bins
+ * @param[in] offrange The maximum value (exclusive)
+ */
+__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
+ __local uint *histogram_local,
+ __global uint *restrict histogram,
+ uint num_bins,
+ uint offset,
+ uint range,
+ uint offrange)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+ uint local_id_x = get_local_id(0);
+
+ uint local_x_size = get_local_size(0);
+
+ if(num_bins > local_x_size)
+ {
+ for(int i = local_id_x; i < num_bins; i += local_x_size)
+ {
+ histogram_local[i] = 0;
+ }
+ }
+ else
+ {
+ if(local_id_x <= num_bins)
+ {
+ histogram_local[local_id_x] = 0;
+ }
+ }
+
+ uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+ uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ VATOMIC_INC16(histogram_local, win_pos);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(num_bins > local_x_size)
+ {
+ for(int i = local_id_x; i < num_bins; i += local_x_size)
+ {
+ atomic_add(histogram + i, histogram_local[i]);
+ }
+ }
+ else
+ {
+ if(local_id_x <= num_bins)
+ {
+ atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
+ }
+ }
+}
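+
+/* Editor's note (illustrative sketch, not part of the original kernel): the bin index
+ * above is win_pos = ((val - offset) * num_bins) / range for values in [offset, offrange).
+ * For example, with offset = 0, range = 256 and num_bins = 64, a pixel value of 200 maps
+ * to bin (200 * 64) / 256 = 50; out-of-range values are routed to the extra slot num_bins
+ * so that they do not disturb the real bins.
+ */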
+
+/** Calculate the histogram of an 8 bit grayscale image's border.
+ *
+ * Each work item processes one pixel and updates the global histogram directly
+ * using a global atomic operation; no local histogram is involved.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length num_bins.
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
+ * @param[in] num_bins The number of bins
+ * @param[in] offset The start of values to use (inclusive)
+ * @param[in] range The range of a bin
+ * @param[in] offrange The maximum value (exclusive)
+ */
+__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
+ __global uint *restrict histogram,
+ uint num_bins,
+ uint offset,
+ uint range,
+ uint offrange)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uint val = (uint)(*input_buffer.ptr);
+
+ uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
+
+ if(val >= offset && (val < offrange))
+ {
+ atomic_inc(histogram + win_pos);
+ }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] histogram_local The local buffer used to hold the partial histogram of each work group. Supported data types: U32
+ * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
+ __local uint *histogram_local,
+ __global uint *restrict histogram)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uint local_index = get_local_id(0);
+ uint local_x_size = get_local_size(0);
+
+ for(int i = local_index; i < 256; i += local_x_size)
+ {
+ histogram_local[i] = 0;
+ }
+
+ uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ atomic_inc(histogram_local + vals.s0);
+ atomic_inc(histogram_local + vals.s1);
+ atomic_inc(histogram_local + vals.s2);
+ atomic_inc(histogram_local + vals.s3);
+ atomic_inc(histogram_local + vals.s4);
+ atomic_inc(histogram_local + vals.s5);
+ atomic_inc(histogram_local + vals.s6);
+ atomic_inc(histogram_local + vals.s7);
+ atomic_inc(histogram_local + vals.s8);
+ atomic_inc(histogram_local + vals.s9);
+ atomic_inc(histogram_local + vals.sa);
+ atomic_inc(histogram_local + vals.sb);
+ atomic_inc(histogram_local + vals.sc);
+ atomic_inc(histogram_local + vals.sd);
+ atomic_inc(histogram_local + vals.se);
+ atomic_inc(histogram_local + vals.sf);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for(int i = local_index; i < 256; i += local_x_size)
+ {
+ atomic_add(histogram + i, histogram_local[i]);
+ }
+}
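+
+/* Editor's note (assumption about the host-side setup, not taken from this patch): this
+ * fixed variant expects histogram_local to provide at least 256 uint elements of __local
+ * memory per work group, e.g. reserved with clSetKernelArg(kernel, arg_index, 256 * sizeof(cl_uint), NULL),
+ * since every work item strides over all 256 bins when clearing and flushing the local histogram.
+ */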
+
+/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
+ *
+ * Each work item processes one pixel and updates the global histogram directly
+ * using a global atomic operation; no local histogram is involved.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements.
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
+ __global uint *restrict histogram)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+ atomic_inc(histogram + *input_buffer.ptr);
+}
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
new file mode 100644
index 0000000000..31dd57b767
--- /dev/null
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = Height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes)
+ * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image
+ * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the phase image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the phase image
+ * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+ IMAGE_DECLARATION(phase),
+ IMAGE_DECLARATION(dst))
+{
+ float bins[NUM_BINS] = { 0 };
+
+ // Compute address for the magnitude and phase images
+ Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+ __global uchar *mag_row_ptr = mag.ptr;
+ __global uchar *phase_row_ptr = phase.ptr;
+
+ for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+ {
+ int xc = 0;
+ for(; xc <= (CELL_WIDTH - 4); xc += 4)
+ {
+ // Load magnitude and phase values
+ const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+ float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+ // Compute histogram index.
+ int4 hidx_s32 = convert_int4(phase_f32);
+
+ // Compute magnitude weights (w0 and w1)
+ const float4 hidx_f32 = convert_float4(hidx_s32);
+
+ // w1 = phase_f32 - hidx_s32
+ const float4 w1_f32 = phase_f32 - hidx_f32;
+
+ // w0 = 1.0 - w1
+ const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+ // Calculate the weights for splitting vote
+ const float4 mag_w0_f32 = mag_f32 * w0_f32;
+ const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+ // Bin 0
+ bins[hidx_s32.s0] += mag_w0_f32.s0;
+ bins[hidx_s32.s1] += mag_w0_f32.s1;
+ bins[hidx_s32.s2] += mag_w0_f32.s2;
+ bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+ hidx_s32 += (int4)1;
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+ // Bin1
+ bins[hidx_s32.s0] += mag_w1_f32.s0;
+ bins[hidx_s32.s1] += mag_w1_f32.s1;
+ bins[hidx_s32.s2] += mag_w1_f32.s2;
+ bins[hidx_s32.s3] += mag_w1_f32.s3;
+ }
+
+ // Left over computation
+ for(; xc < CELL_WIDTH; xc++)
+ {
+ const float mag_value = *((__global short *)mag_row_ptr + xc);
+ const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+ const float w1 = phase_value - floor(phase_value);
+
+ // The quantised phase is the histogram index [0, NUM_BINS - 1]
+ // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+ const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+ // Weighted vote between 2 bins
+ bins[hidx] += mag_value * (1.0f - w1);
+ bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+ }
+
+ // Point to the next row of magnitude and phase images
+ mag_row_ptr += mag_stride_y;
+ phase_row_ptr += phase_stride_y;
+ }
+
+ // Compute address for the destination image
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store the local HOG in the global memory
+ int xc = 0;
+ for(; xc <= (NUM_BINS - 4); xc += 4)
+ {
+ float4 values = vload4(0, bins + xc);
+
+ vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+ }
+
+ // Left over stores
+ for(; xc < NUM_BINS; ++xc)
+ {
+ ((__global float *)dst.ptr)[xc] = bins[xc];
+ }
+}
+#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
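+
+/* Editor's note (worked example under assumed build options, not part of the original file):
+ * with -DNUM_BINS=9 and -DPHASE_SCALE=(9.0f/180.0f), a phase value of 100 gives
+ * 100 * 9 / 180 + 0.5 = 5.5, so hidx = 5 and w1 = 0.5; the pixel's magnitude is then split
+ * equally between bins 5 and 6, which is the weighted vote performed above.
+ */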
+
+#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
+#endif
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ float sum = 0.0f;
+ float4 sum_f32 = (float4)(0.0f);
+
+ // Compute address for the source and destination tensor
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+ {
+ const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+ int xc = 0;
+ for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+ {
+ const float4 val0 = vload4(0, hist_ptr + xc + 0);
+ const float4 val1 = vload4(0, hist_ptr + xc + 4);
+ const float4 val2 = vload4(0, hist_ptr + xc + 8);
+ const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ // Compute val^2 for L2_NORM or L2HYS_NORM
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+#else
+ // Compute |val| for L1_NORM
+ sum_f32 += fabs(val0);
+ sum_f32 += fabs(val1);
+ sum_f32 += fabs(val2);
+ sum_f32 += fabs(val3);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ // Store the un-normalized input values linearly in the output image; they are re-used during the normalization.
+ // Laying them out consecutively keeps the next loop, where the normalization is done, cache friendly.
+ vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+ }
+
+ // Compute left over
+ for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+ {
+ const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ sum += val * val;
+#else
+ sum += fabs(val);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+ }
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+ float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+ // Reset sum
+ sum_f32 = (float4)0.0f;
+ sum = 0.0f;
+
+ int k = 0;
+ for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+ // Scale val
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ // Clip val if over _threshold_l2hys
+ val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+ val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+ val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+ val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+ // Compute val^2
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+ }
+
+ // Compute left over
+ for(; k < NUM_BINS_PER_BLOCK; ++k)
+ {
+ float val = ((__global float *)dst.ptr)[k] * scale;
+
+ // Clip scaled input_value if over L2_HYST_THRESHOLD
+ val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+ sum += val * val;
+
+ ((__global float *)dst.ptr)[k] = val;
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+ // We use the same constant as OpenCV
+ scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+
+ int i = 0;
+ for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+ // Multiply val by the normalization scale factor
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+ }
+
+ for(; i < NUM_BINS_PER_BLOCK; ++i)
+ {
+ ((__global float *)dst.ptr)[i] *= scale;
+ }
+}
+#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
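+
+/* Editor's note (hedged summary of the normalization above): for L2HYS_NORM the block
+ * vector v is first scaled by 1 / (sqrt(sum(v * v)) + 0.1f * NUM_BINS_PER_BLOCK), clipped
+ * element-wise to L2_HYST_THRESHOLD, and then re-scaled by 1 / (sqrt(sum(v' * v')) + 1e-3f),
+ * which matches the OpenCV constant mentioned in the code.
+ */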
+
+#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
+ * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst Pointer to DetectionWindow array
+ * @param[in,out] num_detection_windows Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+ __global float *hog_descriptor,
+ __global DetectionWindow *dst,
+ __global uint *num_detection_windows)
+{
+ // Check if the DetectionWindow array is full
+ if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+ {
+ return;
+ }
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+ // Init score_f32 with 0
+ float4 score_f32 = (float4)0.0f;
+
+ // Init score with 0
+ float score = 0.0f;
+
+ __global float *src_row_ptr = (__global float *)src.ptr;
+
+ // Compute Linear SVM
+ for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+ {
+ int xb = 0;
+
+ const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+ for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+ {
+ // Load descriptor values
+ float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+ float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+ float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+ float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+ // Multiply accumulate
+ score_f32 += a0_f32 * b0_f32;
+ score_f32 += a1_f32 * b1_f32;
+ }
+
+ for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+ {
+ const float a = src_row_ptr[xb];
+ const float b = hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ score += dot(score_f32, (float4)1.0f);
+
+ // Add the bias. The bias is located at the position (descriptor_size() - 1)
+ // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+ score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+ if(score > (float)THRESHOLD)
+ {
+ int id = atomic_inc(num_detection_windows);
+ if(id < MAX_NUM_DETECTION_WINDOWS)
+ {
+ dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
+ dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+ dst[id].width = DETECTION_WINDOW_WIDTH;
+ dst[id].height = DETECTION_WINDOW_HEIGHT;
+ dst[id].idx_class = IDX_CLASS;
+ dst[id].score = score;
+ }
+ }
+}
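+
+/* Editor's note (hedged sketch of the classification above): the window score is the linear
+ * SVM response score = sum_i(block_i * w_i) + bias, with the bias stored as the last element
+ * of hog_descriptor; a detection is appended to dst only when score > THRESHOLD and a free
+ * slot has been reserved with atomic_inc(num_detection_windows).
+ */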
+#endif // defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && ...
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
new file mode 100644
index 0000000000..970e04e150
--- /dev/null
+++ b/src/core/CL/cl_kernels/integral_image.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the horizontal integral of the image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void integral_horizontal(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uint prev = 0;
+
+ for(uint j = 0; j < src_step_x; j += 16)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
+ res.s0 += prev;
+ res.s1 += res.s0;
+ res.s2 += res.s1;
+ res.s3 += res.s2;
+ res.s4 += res.s3;
+ res.s5 += res.s4;
+ res.s6 += res.s5;
+ res.s7 += res.s6;
+ res.s8 += res.s7;
+ res.s9 += res.s8;
+ res.sA += res.s9;
+ res.sB += res.sA;
+ res.sC += res.sB;
+ res.sD += res.sC;
+ res.sE += res.sD;
+ res.sF += res.sE;
+ prev = res.sF;
+ vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
+ }
+}
+
+/** This function computes the vertical integral of the image.
+ *
+ * @param[in,out] src_ptr Pointer to the source image. Supported data types: U32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Image height.
+ */
+__kernel void integral_vertical(
+ IMAGE_DECLARATION(src),
+ uint height)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
+ for(uint j = 1; j < height; ++j)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
+ res += prev;
+ vstore8(res, 0, (__global uint *)offset(&src, 0, j));
+ prev = res;
+ }
+}
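+
+/* Editor's note (worked example, not part of the original kernels): the two passes compose
+ * into the standard integral image I(x, y) = sum of src over [0, x] x [0, y]. For the 2x2
+ * input {{1, 2}, {3, 4}} the horizontal pass produces {{1, 3}, {3, 7}} and the vertical pass
+ * then accumulates the rows in place, giving {{1, 3}, {4, 10}}.
+ */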
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
new file mode 100644
index 0000000000..c4b0df8de9
--- /dev/null
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculates the L1 norm (magnitude) of two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L1 norm magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates the L2 norm (magnitude) of two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L2 norm magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
+{
+ return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates unsigned phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
+ */
+inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+ angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+ return convert_uchar16(angle_deg_f32);
+}
+
+/** Calculates signed phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
+ */
+inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ float16 arct = atan2pi(convert_float16(b), convert_float16(a));
+ arct = select(arct, arct + 2, arct < 0.0f);
+
+ return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & 0xFFu);
+}
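+
+/* Editor's note (worked example, illustrative only): for a gradient (a, b) = (-1, 1),
+ * atan2pi(1, -1) = 0.75, so phase_signed returns (int)(0.75 * 128 + 0.5) = 96 while
+ * phase_unsigned returns (uchar)(0.75 * 180) = 135, i.e. 135 degrees mapped onto
+ * [0, 256) and [0, 180] respectively.
+ */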
+
+#if(1 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
+#elif(2 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
+#else
+#define MAGNITUDE_OP(x, y)
+#endif
+
+#if(1 == PHASE)
+#define PHASE_OP(x, y) phase_unsigned((x), (y))
+#elif(2 == PHASE)
+#define PHASE_OP(x, y) phase_signed((x), (y))
+#else
+#define PHASE_OP(x, y)
+#endif
+
+/** Calculate the magnitude and phase of the gradients of an image.
+ *
+ * @note Magnitude calculation supported: L1 norm(type = 1) and L2 norm(type = 2).
+ * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256).
+ *
+ * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time, e.g. -DPHASE=1
+ * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time, e.g. -DMAGNITUDE=1
+ * @attention The data type of the two inputs is passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=short. Supported data types are: short and int
+ *
+ * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32
+ * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] gy_ptr Pointer to the second source image (gradient Y) . Supported data types: S16, S32
+ * @param[in] gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32
+ * @param[in] magnitude_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] magnitude_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void magnitude_phase(
+ IMAGE_DECLARATION(gx),
+ IMAGE_DECLARATION(gy)
+#ifdef MAGNITUDE
+ ,
+ IMAGE_DECLARATION(magnitude)
+#endif
+#ifdef PHASE
+ ,
+ IMAGE_DECLARATION(phase)
+#endif
+)
+{
+ // Get pixels pointer
+ Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
+ Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
+
+ // Calculate and store the results
+#ifdef MAGNITUDE
+ Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
+ vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
+#endif
+#ifdef PHASE
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+ vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
+#endif
+}
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
new file mode 100644
index 0000000000..50b8312548
--- /dev/null
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+/** This function calculates the sum and sum of squares of a given input image.
+ *
+ * @note To enable the calculation of the sum of squares, -DSTDDEV should be passed as a preprocessor argument.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Height of the input image
+ * @param[out] global_sum Global sum of all elements
+ * @param[out] global_sum_sq Global sum of squares of all elements
+ */
+__kernel void mean_stddev_accumulate(
+ IMAGE_DECLARATION(src),
+ uint height,
+ __global ulong *global_sum
+#if defined STDDEV
+ ,
+ __global ulong *global_sum_sq
+#endif
+)
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ uint8 tmp_sum = 0;
+#if defined STDDEV
+ uint8 tmp_sum_sq = 0;
+#endif
+ // Calculate partial sum
+ for(int i = 0; i < height; i++)
+ {
+ // Load data
+ uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
+
+ tmp_sum += data;
+#if defined STDDEV
+ tmp_sum_sq += data * data;
+#endif
+ }
+ // Perform reduction
+ tmp_sum.s0123 += tmp_sum.s4567;
+ tmp_sum.s01 += tmp_sum.s23;
+ atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
+
+#if defined STDDEV
+ tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
+ tmp_sum_sq.s01 += tmp_sum_sq.s23;
+ atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
+#endif
+}
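+
+/* Editor's note (assumed host-side post-processing, not part of this file): with
+ * N = image width * height, the accumulated values are typically reduced to
+ *   mean   = global_sum / N
+ *   stddev = sqrt(global_sum_sq / N - mean * mean)
+ * once the kernel has completed.
+ */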
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
new file mode 100644
index 0000000000..799b1e8c3b
--- /dev/null
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#ifndef DATA_TYPE_MIN
+#define DATA_TYPE_MIN 0x0
+#endif
+
+#ifndef DATA_TYPE_MAX
+#define DATA_TYPE_MAX 0xFF
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** This function identifies the minimum and maximum values of an input image.
+ *
+ * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
+ * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
+ * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[in] width Input image width
+ */
+__kernel void minmax(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Initialize local minimum and local maximum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_min = type_max;
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_max = type_min;
+
+ // Calculate min/max of row
+ uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ local_min = min(data, local_min);
+ local_max = max(data, local_max);
+ }
+
+#ifdef NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+ local_max = max(local_max, select(type_min, data, widx));
+ local_min = min(local_min, select(type_max, data, widx));
+#endif
+
+ // Perform min/max reduction
+ local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
+ local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
+
+ local_min.s0123 = min(local_min.s0123, local_min.s4567);
+ local_max.s0123 = max(local_max.s0123, local_max.s4567);
+
+ local_min.s01 = min(local_min.s01, local_min.s23);
+ local_max.s01 = max(local_max.s01, local_max.s23);
+
+ local_min.s0 = min(local_min.s0, local_min.s1);
+ local_max.s0 = max(local_max.s0, local_max.s1);
+
+ // Update global min/max
+ atomic_min(&min_max[0], local_min.s0);
+ atomic_max(&min_max[1], local_max.s0);
+}
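+
+/* Editor's note (assumption about the host-side setup): because this kernel only applies
+ * atomic_min / atomic_max, the two-element min_max buffer is expected to be initialized
+ * before launch, e.g. to { DATA_TYPE_MAX, DATA_TYPE_MIN }, so the atomics can lower the
+ * minimum and raise the maximum from there.
+ */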
+
+/** This function counts the occurrences of the minimum and maximum values in an image and records their positions.
+ *
+ * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
+ * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
+ * @param[out] min_loc Array that holds the location of the minimum value occurrences
+ * @param[in] max_min_loc_count The maximum number of min value occurrences coordinates the array can hold
+ * @param[out] max_loc Array that holds the location of the maximum value occurrences
+ * @param[in] max_max_loc_count The maximum number of max value occurrences coordinates the array can hold
+ */
+__kernel void minmaxloc(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ __global uint *min_max_count
+#if defined LOCATE_MIN
+ ,
+ __global Coordinates2D *min_loc, uint max_min_loc_count
+#endif
+#if defined LOCATE_MAX
+ ,
+ __global Coordinates2D *max_loc, uint max_max_loc_count
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
+#if defined COUNT_MIN_MAX
+ if(value == min_max[0])
+ {
+ uint idx = atomic_inc(&min_max_count[0]);
+#if defined LOCATE_MIN
+ if(idx < max_min_loc_count)
+ {
+ min_loc[idx].x = get_global_id(0);
+ min_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+ if(value == min_max[1])
+ {
+ uint idx = atomic_inc(&min_max_count[1]);
+#if defined LOCATE_MAX
+ if(idx < max_max_loc_count)
+ {
+ max_loc[idx].x = get_global_id(0);
+ max_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+#endif
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
new file mode 100644
index 0000000000..f860c96bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+/** This function applies a non-linear filter on a 3x3 box basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined MAX
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = row_reduce_max_3(tmp);
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = top.s12345678;
+ uchar8 p2 = top.s23456789;
+ uchar8 p3 = middle.s01234567;
+ uchar8 p4 = middle.s12345678;
+ uchar8 p5 = middle.s23456789;
+ uchar8 p6 = bottom.s01234567;
+ uchar8 p7 = bottom.s12345678;
+ uchar8 p8 = bottom.s23456789;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter on a 3x3 cross basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar8 top = vload8(0, offset(&src, 0, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar8 bottom = vload8(0, offset(&src, 0, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar8 tmp_middle = row_reduce_min_3(middle);
+ uchar8 out = min(tmp_middle, min(top, bottom));
+#elif defined MAX
+ uchar8 tmp_middle = row_reduce_max_3(middle);
+ uchar8 out = max(tmp_middle, max(top, bottom));
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = middle.s01234567;
+ uchar8 p2 = middle.s12345678;
+ uchar8 p3 = middle.s23456789;
+ uchar8 p4 = bottom.s01234567;
+ uchar8 out = sort5(p0, p1, p2, p3, p4);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter on a 3x3 disk basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined MAX
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = row_reduce_max_3(tmp);
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = top.s12345678;
+ uchar8 p2 = top.s23456789;
+ uchar8 p3 = middle.s01234567;
+ uchar8 p4 = middle.s12345678;
+ uchar8 p5 = middle.s23456789;
+ uchar8 p6 = bottom.s01234567;
+ uchar8 p7 = bottom.s12345678;
+ uchar8 p8 = bottom.s23456789;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
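+
+/* Editor's note: for a 3x3 neighbourhood the box and disk patterns cover the same nine
+ * pixels, which is why the two kernels above share identical filtering code, while the
+ * cross pattern only uses the five pixels on its vertical and horizontal arms.
+ */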
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
new file mode 100644
index 0000000000..d9ae95fd2d
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 8 disk neighbourhoods of diameter 5 and return their medians.
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top.s01234567;
+ uchar8 p4 = top.s12345678;
+ uchar8 p5 = top.s23456789;
+ uchar8 p6 = top.s3456789A;
+ uchar8 p7 = top.s456789AB;
+ uchar8 p8 = middle.s01234567;
+ uchar8 p9 = middle.s12345678;
+ uchar8 p10 = middle.s23456789;
+ uchar8 p11 = middle.s3456789A;
+ uchar8 p12 = middle.s456789AB;
+ uchar8 p13 = bottom.s01234567;
+ uchar8 p14 = bottom.s12345678;
+ uchar8 p15 = bottom.s23456789;
+ uchar8 p16 = bottom.s3456789A;
+ uchar8 p17 = bottom.s456789AB;
+ uchar8 p18 = bottom2.s01234567;
+ uchar8 p19 = bottom2.s12345678;
+ uchar8 p20 = bottom2.s23456789;
+
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p4, p5);
+ SORT(p6, p7);
+ SORT(p8, p9);
+ SORT(p10, p11);
+ SORT(p12, p13);
+ SORT(p14, p15);
+ SORT(p16, p17);
+ SORT(p18, p19);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p4, p6);
+ SORT(p5, p7);
+ SORT(p8, p10);
+ SORT(p9, p11);
+ SORT(p12, p14);
+ SORT(p13, p15);
+ SORT(p16, p18);
+ SORT(p17, p19);
+ SORT(p1, p2);
+ SORT(p5, p6);
+ SORT(p0, p4);
+ SORT(p3, p7);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p8, p12);
+ SORT(p11, p15);
+ SORT(p17, p18);
+ SORT(p16, p20);
+ SORT(p1, p5);
+ SORT(p2, p6);
+ SORT(p9, p13);
+ SORT(p10, p14);
+ SORT(p0, p8);
+ SORT(p7, p15);
+ SORT(p17, p20);
+ SORT(p1, p4);
+ SORT(p3, p6);
+ SORT(p9, p12);
+ SORT(p11, p14);
+ SORT(p18, p20);
+ SORT(p0, p16);
+ SORT(p2, p4);
+ SORT(p3, p5);
+ SORT(p10, p12);
+ SORT(p11, p13);
+ SORT(p1, p9);
+ SORT(p6, p14);
+ SORT(p19, p20);
+ SORT(p3, p4);
+ SORT(p11, p12);
+ SORT(p1, p8);
+ SORT(p2, p10);
+ SORT(p5, p13);
+ SORT(p7, p14);
+ SORT(p3, p11);
+ SORT(p2, p8);
+ SORT(p4, p12);
+ SORT(p7, p13);
+ SORT(p1, p17);
+ SORT(p3, p10);
+ SORT(p5, p12);
+ SORT(p1, p16);
+ SORT(p2, p18);
+ SORT(p3, p9);
+ SORT(p6, p12);
+ SORT(p2, p16);
+ SORT(p3, p8);
+ SORT(p7, p12);
+ SORT(p5, p9);
+ SORT(p6, p10);
+ SORT(p4, p8);
+ SORT(p7, p11);
+ SORT(p3, p19);
+ SORT(p5, p8);
+ SORT(p7, p10);
+ SORT(p3, p18);
+ SORT(p4, p20);
+ SORT(p6, p8);
+ SORT(p7, p9);
+ SORT(p3, p17);
+ SORT(p5, p20);
+ SORT(p7, p8);
+ SORT(p3, p16);
+ SORT(p6, p20);
+ SORT(p5, p17);
+ SORT(p7, p20);
+ SORT(p4, p16);
+ SORT(p6, p18);
+ SORT(p5, p16);
+ SORT(p7, p19);
+ SORT(p7, p18);
+ SORT(p6, p16);
+ SORT(p7, p17);
+ SORT(p10, p18);
+ SORT(p7, p16);
+ SORT(p9, p17);
+ SORT(p8, p16);
+ SORT(p9, p16);
+ SORT(p10, p16);
+
+ return p10;
+}
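+
+// The disk neighbourhood above contains 21 samples per output pixel (3 from each of the top2 and
+// bottom2 rows, 5 from each of the top, middle and bottom rows), so the median is the 11th
+// smallest value, which is why the network returns p10.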
+
+/** Sorting network to sort 8 box neighbourhoods of size 5x5 and return their medians.
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top2.s3456789A;
+ uchar8 p4 = top2.s456789AB;
+ uchar8 p5 = top.s01234567;
+ uchar8 p6 = top.s12345678;
+ uchar8 p7 = top.s23456789;
+ uchar8 p8 = top.s3456789A;
+ uchar8 p9 = top.s456789AB;
+ uchar8 p10 = middle.s01234567;
+ uchar8 p11 = middle.s12345678;
+ uchar8 p12 = middle.s23456789;
+ uchar8 p13 = middle.s3456789A;
+ uchar8 p14 = middle.s456789AB;
+ uchar8 p15 = bottom.s01234567;
+ uchar8 p16 = bottom.s12345678;
+ uchar8 p17 = bottom.s23456789;
+ uchar8 p18 = bottom.s3456789A;
+ uchar8 p19 = bottom.s456789AB;
+ uchar8 p20 = bottom2.s01234567;
+ uchar8 p21 = bottom2.s12345678;
+ uchar8 p22 = bottom2.s23456789;
+ uchar8 p23 = bottom2.s3456789A;
+ uchar8 p24 = bottom2.s456789AB;
+
+ SORT(p1, p2);
+ SORT(p0, p1);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p3, p4);
+ SORT(p4, p5);
+ SORT(p0, p3);
+ SORT(p2, p5);
+ SORT(p2, p3);
+ SORT(p1, p4);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p7, p8);
+ SORT(p6, p7);
+ SORT(p7, p8);
+ SORT(p10, p11);
+ SORT(p9, p10);
+ SORT(p10, p11);
+ SORT(p6, p9);
+ SORT(p8, p11);
+ SORT(p8, p9);
+ SORT(p7, p10);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p0, p6);
+ SORT(p4, p10);
+ SORT(p4, p6);
+ SORT(p2, p8);
+ SORT(p2, p4);
+ SORT(p6, p8);
+ SORT(p1, p7);
+ SORT(p5, p11);
+ SORT(p5, p7);
+ SORT(p3, p9);
+ SORT(p3, p5);
+ SORT(p7, p9);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p5, p6);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p12, p13);
+ SORT(p13, p14);
+ SORT(p16, p17);
+ SORT(p15, p16);
+ SORT(p16, p17);
+ SORT(p12, p15);
+ SORT(p14, p17);
+ SORT(p14, p15);
+ SORT(p13, p16);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p19, p20);
+ SORT(p18, p19);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p21, p23);
+ SORT(p22, p24);
+ SORT(p22, p23);
+ SORT(p18, p21);
+ SORT(p20, p23);
+ SORT(p20, p21);
+ SORT(p19, p22);
+ SORT(p22, p24);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p12, p18);
+ SORT(p16, p22);
+ SORT(p16, p18);
+ SORT(p14, p20);
+ SORT(p20, p24);
+ SORT(p14, p16);
+ SORT(p18, p20);
+ SORT(p22, p24);
+ SORT(p13, p19);
+ SORT(p17, p23);
+ SORT(p17, p19);
+ SORT(p15, p21);
+ SORT(p15, p17);
+ SORT(p19, p21);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p17, p18);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p0, p12);
+ SORT(p8, p20);
+ SORT(p8, p12);
+ SORT(p4, p16);
+ SORT(p16, p24);
+ SORT(p12, p16);
+ SORT(p2, p14);
+ SORT(p10, p22);
+ SORT(p10, p14);
+ SORT(p6, p18);
+ SORT(p6, p10);
+ SORT(p10, p12);
+ SORT(p1, p13);
+ SORT(p9, p21);
+ SORT(p9, p13);
+ SORT(p5, p17);
+ SORT(p13, p17);
+ SORT(p3, p15);
+ SORT(p11, p23);
+ SORT(p11, p15);
+ SORT(p7, p19);
+ SORT(p7, p11);
+ SORT(p11, p13);
+ SORT(p11, p12);
+ return p12;
+}
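+
+// The box neighbourhood above contains 25 samples per output pixel (5 from each row), so the
+// median is the 13th smallest value, which is why the network returns p12.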
+
+/** This function applies a non-linear filter over a 5x5 box neighbourhood of an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, -2, -2));
+ uchar16 top = vload16(0, offset(&src, -2, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, -2, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
+ uchar8 out = row_reduce_min_5(tmp);
+#elif defined MAX
+ uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
+ uchar8 out = row_reduce_max_5(tmp);
+#elif defined MEDIAN
+ uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter over a 5x5 cross neighbourhood of an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, 0, -2));
+ uchar16 top = vload16(0, offset(&src, 0, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, 0, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, 0, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar8 tmp_middle = row_reduce_min_5(middle);
+ uchar8 out = min(tmp_middle, min(min(top2.s01234567, top.s01234567), min(bottom.s01234567, bottom2.s01234567)));
+#elif defined MAX
+ uchar8 tmp_middle = row_reduce_max_5(middle);
+ uchar8 out = max(tmp_middle, max(max(top2.s01234567, top.s01234567), max(bottom.s01234567, bottom2.s01234567)));
+#elif defined MEDIAN
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top.s01234567;
+ uchar8 p2 = middle.s01234567;
+ uchar8 p3 = middle.s12345678;
+ uchar8 p4 = middle.s23456789;
+ uchar8 p5 = middle.s3456789A;
+ uchar8 p6 = middle.s456789AB;
+ uchar8 p7 = bottom.s01234567;
+ uchar8 p8 = bottom2.s01234567;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter over a 5x5 disk neighbourhood of an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, -1, -2));
+ uchar16 top = vload16(0, offset(&src, -2, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, -2, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, -1, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp_3 = min(top2, bottom2);
+ uchar16 tmp_5 = min(middle, min(top, bottom));
+ uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
+ uchar8 out = min(tmp_3_red, tmp_5_red);
+#elif defined MAX
+ uchar16 tmp_3 = max(top2, bottom2);
+ uchar16 tmp_5 = max(middle, max(top, bottom));
+ uchar8 tmp_3_red = row_reduce_max_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_max_5(tmp_5);
+ uchar8 out = max(tmp_3_red, tmp_5_red);
+#elif defined MEDIAN
+ uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
new file mode 100644
index 0000000000..77da2091b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts two vectors element-wise.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b) \
+ { \
+ uchar8 min_val = min(a, b); \
+ uchar8 max_val = max(a, b); \
+ a = min_val; \
+ b = max_val; \
+ }
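+
+// SORT is a compare-exchange: after it runs, `a` holds the element-wise minima and `b` the
+// element-wise maxima. Illustrative sketch (values chosen arbitrarily):
+//   uchar8 a = (uchar8)7, b = (uchar8)3;
+//   SORT(a, b); // now a == (uchar8)3 and b == (uchar8)7 in every lane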
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p1, p2);
+ SORT(p0, p4);
+ SORT(p1, p4);
+ SORT(p2, p4);
+
+ return p2;
+}
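+
+// Illustrative sketch (arbitrary values, broadcast to all lanes): the median of {9, 1, 5, 7, 3} is 5.
+//   uchar8 m = sort5((uchar8)9, (uchar8)1, (uchar8)5, (uchar8)7, (uchar8)3);
+//   // m == (uchar8)5 in every lane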
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eighth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p1);
+ SORT(p3, p4);
+ SORT(p6, p7);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p3);
+ SORT(p5, p8);
+ SORT(p4, p7);
+ SORT(p3, p6);
+ SORT(p1, p4);
+ SORT(p2, p5);
+ SORT(p4, p7);
+ SORT(p4, p2);
+ SORT(p6, p4);
+ SORT(p4, p2);
+
+ return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param val Values over which to calculate the sliding-window minima
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+ return min(val.s01234567, min(val.s12345678, val.s23456789));
+}
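+
+// Each output lane of row_reduce_min_3 is the minimum of a 3-wide window sliding over the input,
+// i.e. out.sN = min(val[N], val[N + 1], val[N + 2]) for N = 0..7. The max and size-5 variants
+// below follow the same pattern.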
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param val Values over which to calculate the sliding-window maxima
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+ return max(val.s01234567, max(val.s12345678, val.s23456789));
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param val Values over which to calculate the sliding-window minima
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+ return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param val Values over which to calculate the sliding-window maxima
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+ return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
+}
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
new file mode 100644
index 0000000000..0e388d7496
--- /dev/null
+++ b/src/core/CL/cl_kernels/nonmax.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs non-maxima suppression over a 3x3 window on a given image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ if(all(vc == (DATA_TYPE)0))
+ {
+ vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);
+
+ return;
+ }
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+ out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+ out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+ vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
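+
+// A pixel is kept by the suppression above only if it is >= its three upper neighbours and its
+// left neighbour, and strictly > its right neighbour and its three lower neighbours. When two
+// adjacent pixels hold the same maximum, only the lower/right one of the pair therefore survives.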
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
new file mode 100644
index 0000000000..076b0d8909
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply cross map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ DATA_TYPE acc = 0;
+
+ const int num_of_slices = get_global_size(2);
+ const int current_slice = get_global_id(2);
+
+ const int left_slice = max(current_slice - (int)radius, (int)0);
+ const int right_slice = min(current_slice + (int)radius, (int)(num_of_slices - 1));
+
+ for(int i = left_slice; i <= right_slice; i++)
+ {
+ acc += *(__global DATA_TYPE *)tensor3D_offset(&squared_in, 0, 0, i - current_slice);
+ }
+
+ const float normalized = pow(kappa + coeff * (float)acc, beta);
+
+ const float normalized_pixel = (float) * ((__global DATA_TYPE *)in.ptr) / normalized;
+
+ *(__global DATA_TYPE *)out.ptr = CONVERT(normalized_pixel, DATA_TYPE);
+}
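+
+// Assuming squared_input holds the element-wise squares of input (as its name suggests), the
+// kernel above computes, per element:
+//   output = input / (kappa + coeff * sum(input_i^2 for i in [-radius, radius])) ^ beta
+// where the sum runs across neighbouring feature maps (the Z dimension), clamped at the borders.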
+
+/** Apply in map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ acc_vec = 0;
+
+ const int current_pos = get_global_id(0) << 2;
+
+ const int left_pos = max(current_pos - (int)radius, -3);
+ const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1));
+
+ for(int i = left_pos; i <= right_pos; i += 1)
+ {
+ acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0));
+ }
+
+ const float4 normalized = pow((float4)kappa + coeff * (float4)acc_vec, beta);
+
+ const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized;
+
+ vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
new file mode 100644
index 0000000000..e1131d5573
--- /dev/null
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/*
+ * The criteria for lost tracking are that the spatial gradient matrix has:
+ * - a determinant less than DETERMINANT_THR
+ * - or a minimum eigenvalue smaller than EIGENVALUE_THR
+ *
+ * The thresholds for the determinant and the minimum eigenvalue are
+ * defined by the OpenVX spec
+ *
+ * Note: Tracking is also lost when the tracked point's coordinates fall outside
+ * the image
+ *
+ * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
+ */
+
+/* Internal Lucas-Kanade Keypoint struct */
+typedef struct InternalKeypoint
+{
+ float x; /**< The x coordinate. */
+ float y; /**< The y coordinate. */
+ float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+ float dummy;
+} InternalKeypoint;
+
+/** Threshold for the determinant. Used for lost tracking criteria */
+#define DETERMINANT_THR 1.0e-07f
+
+/** Threshold for the minimum eigenvalue. Used for lost tracking criteria */
+#define EIGENVALUE_THR 1.0e-04f
+
+/** Constants used for Lucas-Kanade Algorithm */
+#define W_BITS (14)
+#define FLT_SCALE (1.0f / (float)(1 << 20))
+#define D0 ((float)(1 << W_BITS))
+#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
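+
+// With W_BITS = 14 these evaluate to D0 = 16384, D1 = 1/512 and FLT_SCALE = 1/1048576:
+// D0 scales the bilinear interpolation weights to fixed point, D1 rescales the interpolated
+// pixel values (keeping 5 fractional bits), and FLT_SCALE converts the integer accumulators
+// back to floating point.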
+
+/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
+ *
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level(
+ __global float4 *old_points_internal,
+ __global float4 *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ // Get old and new keypoints
+ float4 old_point = old_points_internal[idx];
+ float4 new_point = new_points_internal[idx];
+
+ // Scale accordingly with the pyramid_scale
+ old_point.xy *= (float2)(2.0f);
+ new_point.xy *= (float2)(2.0f);
+
+ old_points_internal[idx] = old_point;
+ new_points_internal[idx] = new_point;
+}
+
+/** Initializes the internal new points array when the level of pyramid is equal to max.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max(
+ __global Keypoint *old_points,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+
+ // Get old keypoint to track
+ InternalKeypoint old_point_internal;
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = old_point_internal;
+}
+
+/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max_initial_estimate(
+ __global Keypoint *old_points,
+ __global Keypoint *new_points_estimates,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+ Keypoint new_point_estimate = new_points_estimates[idx];
+ InternalKeypoint old_point_internal;
+ InternalKeypoint new_point_internal;
+
+ // Get old keypoint to track
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Get new keypoint to track
+ new_point_internal.x = new_point_estimate.x * scale;
+ new_point_internal.y = new_point_estimate.y * scale;
+ new_point_internal.tracking_status = new_point_estimate.tracking_status;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = new_point_internal;
+}
+
+/** Rounds the coordinates stored in the internal new points array and stores them as output keypoints
+ *
+ * @param[in] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[out] new_points The output array of key points that are defined at the new_images high resolution pyramid.
+ */
+__kernel void finalize(
+ __global InternalKeypoint *new_points_internal,
+ __global Keypoint *new_points)
+{
+ int idx = get_global_id(0);
+
+ // Load internal keypoint
+ InternalKeypoint new_point_internal = new_points_internal[idx];
+
+ // Calculate output point
+ Keypoint new_point;
+ new_point.x = round(new_point_internal.x);
+ new_point.y = round(new_point_internal.y);
+ new_point.tracking_status = new_point_internal.tracking_status;
+
+ // Store new point
+ new_points[idx] = new_point;
+}
+
+/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0 of the pyramid. These values will be used in stage 1.
+ *
+ * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8
+ * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes)
+ * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes)
+ * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image
+ * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16
+ * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes)
+ * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes)
+ * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
+ * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. Supported data types: S16
+ * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes)
+ * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes)
+ * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
+ * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if level 0 of the pyramid
+ */
+void __kernel lktracker_stage0(
+ IMAGE_DECLARATION(old_image),
+ IMAGE_DECLARATION(old_scharr_gx),
+ IMAGE_DECLARATION(old_scharr_gy),
+ __global float4 *old_points,
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0)
+{
+ int idx = get_global_id(0);
+
+ Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
+ Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
+ Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
+
+ // Get old keypoint
+ float2 old_keypoint = old_points[idx].xy - (float2)half_window;
+
+ // Get the floor value
+ float2 iold_keypoint = floor(old_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+
+ // Not valid coordinate. It sets min_eig to 0.0f
+ coeff[idx].s3 = 0.0f;
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = old_keypoint - iold_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Scharr images
+ // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
+ // w_scharr.s1 = ab.x * (1.0f - ab.y)
+ // w_scharr.s2 = (1.0f - ab.x) * ab.y
+ // w_scharr.s3 = ab.x * ab.y
+
+ float4 w_scharr;
+ w_scharr.s3 = ab.x * ab.y;
+ w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y;
+ w_scharr.s12 = ab - (float2)w_scharr.s3;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round(w_scharr.s0 * D0)
+ // w.s1 = round(w_scharr.s1 * D0)
+ // w.s2 = round(w_scharr.s2 * D0)
+    // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w = round(w_scharr * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ int4 iG = (int4)0;
+
+ // Window offset
+ int window_offset = idx * window_dimension_pow2;
+
+ // Compute Spatial Gradient Matrix G
+ for(ushort ky = 0; ky < window_dimension; ++ky)
+ {
+ int offset_y = iold_keypoint.y + ky;
+ for(ushort kx = 0; kx < window_dimension; ++kx)
+ {
+ int offset_x = iold_keypoint.x + kx;
+ float4 px;
+
+ // Load values from old_image for computing the bilinear interpolation
+ px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
+ vload2(0, offset(&old_image, offset_x, offset_y + 1))));
+
+ // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
+ float4 old_i;
+
+ // Compute bilinear interpolation (with D1 scale factor) for ival
+ old_i.s0 = dot(px, w) * D1;
+
+ // Load values from old_scharr_gx for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for ixval
+ old_i.s1 = dot(px, w_scharr);
+
+ // Load values from old_scharr_gy for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for iyval
+ old_i.s2 = dot(px, w_scharr);
+
+ // Rounding (it could be omitted. Used just for matching the VX implementation)
+ int4 iold = convert_int4(round(old_i));
+
+ // Accumulate values in the Spatial Gradient Matrix
+ iG.s0 += (int)(iold.s1 * iold.s1);
+ iG.s1 += (int)(iold.s1 * iold.s2);
+ iG.s2 += (int)(iold.s2 * iold.s2);
+
+ // Store ival, ixval and iyval
+ iold_val[window_offset + kx] = convert_short4(iold);
+ }
+ window_offset += window_dimension;
+ }
+
+ // Scale iA11, iA12 and iA22
+ float4 G = convert_float4(iG) * (float4)FLT_SCALE;
+
+ // Compute minimum eigen value
+ G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
+
+    // Store A11, A12, A22 and min_eig
+ coeff[idx] = G;
+}
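+
+// The accumulation loop above builds the structure tensor of the tracking window,
+//   G = | sum(Ix * Ix)  sum(Ix * Iy) |
+//       | sum(Ix * Iy)  sum(Iy * Iy) |
+// and G.s3 holds its smaller eigenvalue scaled by eig_const = 1 / (2 * window_dimension^2):
+//   min_eig = (A11 + A22 - sqrt((A11 - A22)^2 + 4 * A12^2)) * eig_const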
+
+/** Computes the motion vector for a given keypoint
+ *
+ * @param[in] new_image_ptr Pointer to the input new image. Supported data types: U8
+ * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes)
+ * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes)
+ * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] num_iterations The maximum number of iterations
+ * @param[in] epsilon The value for terminating the algorithm.
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if level of pyramid = 0
+ * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
+ * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ */
+void __kernel lktracker_stage1(
+ IMAGE_DECLARATION(new_image),
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const int num_iterations,
+ const float epsilon,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0,
+ const int term_iteration,
+ const int term_epsilon)
+{
+ int idx = get_global_id(0);
+ Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ float4 G = coeff[idx];
+
+ // Determinant
+ float D = G.s0 * G.s2 - G.s1 * G.s1;
+
+ // Check if it is a good point to track
+ if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0;
+ }
+
+ return;
+ }
+
+ // Compute inverse
+ //D = native_recip(D);
+ D = 1.0 / D;
+
+ // Get new keypoint
+ float2 new_keypoint = new_points[idx].xy - (float)half_window;
+
+ // Get new point
+ float2 out_new_point = new_points[idx].xy;
+
+ // Keep delta obtained in the previous iteration
+ float2 prev_delta = (float2)0.0f;
+
+ int j = 0;
+ while(j < num_iterations)
+ {
+ // Get the floor value
+ float2 inew_keypoint = floor(new_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+ else
+ {
+ new_points[idx].xy = out_new_point;
+ }
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = new_keypoint - inew_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
+ // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
+ // w.s2 = round((1.0f - ab.x) * ab.y * D0)
+ // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w.s3 = ab.x * ab.y;
+ w.s0 = w.s3 + 1.0f - ab.x - ab.y;
+ w.s12 = ab - (float2)w.s3;
+ w = round(w * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2;
+
+ // Mismatch vector
+ int2 ib = 0;
+
+ // Old val offset
+ int old_val_offset = idx * window_dimension_pow2;
+
+ for(int ky = 0; ky < window_dimension; ++ky)
+ {
+ for(int kx = 0; kx < window_dimension; ++kx)
+ {
+ // ival, ixval and iyval have been computed in the previous stage
+ int4 old_ival = convert_int4(iold_val[old_val_offset]);
+
+                // Load values from new_image for computing the bilinear interpolation
+ float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
+ vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
+
+ // Compute bilinear interpolation on new image
+ int jval = (int)round(dot(px, w) * D1);
+
+ // Compute luminance difference
+ int diff = (int)(jval - old_ival.s0);
+
+ // Accumulate values in mismatch vector
+ ib += (diff * old_ival.s12);
+
+ // Update old val offset
+ old_val_offset++;
+ }
+ }
+
+ float2 b = convert_float2(ib) * (float2)FLT_SCALE;
+
+ // Optical Flow
+ float2 delta;
+
+ delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
+ delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
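+
+        // This is the closed-form solve of the 2x2 system G * delta = -b:
+        //   delta.x = (A12 * b.y - A22 * b.x) / det(G)
+        //   delta.y = (A12 * b.x - A11 * b.y) / det(G)
+        // with D already holding 1 / det(G).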
+
+ // Update new point coordinate
+ new_keypoint += delta;
+
+ out_new_point = new_keypoint + (float2)half_window;
+
+ if(term_epsilon == 1)
+ {
+ float mag2 = dot(delta, delta);
+
+ if(mag2 <= epsilon)
+ {
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+ }
+
+ // Check convergence analyzing the previous delta
+ if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
+ {
+ out_new_point -= delta * (float2)0.5f;
+
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+
+ // Update previous delta
+ prev_delta = delta;
+
+ if(term_iteration == 1)
+ {
+ j++;
+ }
+ }
+
+ new_points[idx].xy = out_new_point;
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
new file mode 100644
index 0000000000..ae2031f422
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
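+
+// Illustrative expansion (assuming the host passes e.g. -DROUND=_rte and -DSATURATE):
+//   CONVERT_OP_FLOAT(x, short16, ROUND) -> convert_short16_sat_rte(x)
+// Without SATURATE the non-saturating convert_short16_rte(x) is generated instead.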
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, otherwise -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform multiplication
+#if defined DATA_TYPE_FLOAT
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif
+
+ // Store result
+ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
new file mode 100644
index 0000000000..05c437cd17
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
+#else
+#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
+#endif
+#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed; otherwise -DDATA_TYPE_RES=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32
+ */
+__kernel void pixelwise_mul_int(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const uint scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform multiplication and store result
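+ // The scale argument is applied as a right shift, i.e. the product is divided by 2^scale before
+ // conversion to the output type.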
+ vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
new file mode 100644
index 0000000000..1902df9b7d
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined POOL_AVG
+#define POOL_OP(x, y) ((x) + (y))
+#else
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif
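+
+/* For average pooling, POOL_OP accumulates a sum which is then scaled by calculate_avg_scale();
+ * for max pooling it takes the fmax of its operands. */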
+
+float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ int end_x = min(start_x + pool_size, upper_bound_w);
+ int end_y = min(start_y + pool_size, upper_bound_h);
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
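+
+/* Illustrative example (assumed values): with pool_size = 3, stride = 1, pad = 0 and upper bounds of 8,
+ * the work-item at (6, 6) spans x and y in [6, min(9, 8)) = [6, 8), so the returned scale is 1/(2*2)
+ * rather than 1/9: windows clipped by the upper bound are averaged over fewer elements. */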
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE e.g. -DPOOL_STRIDE=2. Supported strides are 1, 2, 3
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32.
+ * @note In case of average pooling, -DPOOL_AVG must be provided; otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  max_dims                              The maximum index that can be accessed in the x and y dimensions (width + pad_x, height + pad_y)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_2(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+ // Divide by the pooling region size (2x2, clamped at the borders) in case of average pooling
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE e.g. -DPOOL_STRIDE=2. Supported strides are 1, 2, 3
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32.
+ * @note In case of average pooling, -DPOOL_AVG must be provided; otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  max_dims                              The maximum index that can be accessed in the x and y dimensions (width + pad_x, height + pad_y)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_3(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ data0 = POOL_OP(data0, data2);
+ DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+ // Divide by the pooling region size (3x3, clamped at the borders) in case of average pooling
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
new file mode 100644
index 0000000000..e0f3bf3468
--- /dev/null
+++ b/src/core/CL/cl_kernels/remap.cl
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Performs a remapping of an input image to an output image, given two remapping images, using nearest neighbour interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_x                      mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                         Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+ map_coords += (float8)(0.5f);
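+ // map_coords holds four interleaved (x, y) pairs; adding 0.5 before the convert_int8() truncation
+ // below effectively rounds each coordinate to its nearest pixel.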
+
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                      mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                         Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+ vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
new file mode 100644
index 0000000000..9ef33b83ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
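+
+/* Illustrative example (assumed sizes): scaling a 64x64 input to a 32x32 output gives a ratio of 2,
+ * so transform_bilinear() maps output x = 0 to input x = (0 + 0.5) * 2 - 0.5 = 0.5, i.e. the sample
+ * is taken halfway between the first two input pixels (pixel-centre alignment). */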
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float8 tc = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
+ vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float8 tc = clamp_to_border(transform_bilinear(get_current_coords(), r), input_width, input_height);
+ vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
new file mode 100644
index 0000000000..ef9878c1a3
--- /dev/null
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
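+
+/* Scharr 3x3 kernels applied by the code below (derived from the per-row coefficients):
+ *   Gx = [ -3  0  +3 ; -10  0  +10 ; -3  0  +3 ]
+ *   Gy = [ -3 -10 -3 ;   0  0    0 ; +3 +10 +3 ]
+ */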
+
+/** This OpenCL kernel computes Scharr3x3.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                            Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scharr3x3(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+ short8 gy = (short8)0;
+#endif
+
+ // Row0
+ uchar16 temp = vload16(0, offset(&src, -1, -1));
+ short8 left = convert_short8(temp.s01234567);
+ short8 middle = convert_short8(temp.s12345678);
+ short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-3);
+ gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(-3);
+ gy += middle * (short8)(-10);
+ gy += right * (short8)(-3);
+#endif
+
+ // Row1
+ temp = vload16(0, offset(&src, -1, 0));
+ left = convert_short8(temp.s01234567);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-10);
+ gx += right * (short8)(+10);
+#endif
+
+ // Row2
+ temp = vload16(0, offset(&src, -1, 1));
+ left = convert_short8(temp.s01234567);
+ middle = convert_short8(temp.s12345678);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-3);
+ gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(+3);
+ gy += middle * (short8)(+10);
+ gy += right * (short8)(+3);
+#endif
+
+ // Store results
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
new file mode 100644
index 0000000000..4eb0eef770
--- /dev/null
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/***********************************************/
+/* Begin implementation of Sobel3x3 filter */
+/***********************************************/
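+
+/* Sobel 3x3 kernels applied by the code below (derived from the per-row coefficients):
+ *   Gx = [ -1  0 +1 ; -2  0 +2 ; -1  0 +1 ]
+ *   Gy = [ -1 -2 -1 ;  0  0  0 ; +1 +2 +1 ]
+ */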
+
+/** This OpenCL kernel computes a Sobel3x3 filter.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel3x3(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+ short8 gy = (short8)0;
+#endif
+
+ // Row0
+ uchar16 temp = vload16(0, offset(&src, -1, -1));
+ short8 left = convert_short8(temp.s01234567);
+ short8 middle = convert_short8(temp.s12345678);
+ short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-1);
+ gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(-1);
+ gy += middle * (short8)(-2);
+ gy += right * (short8)(-1);
+#endif
+
+ // Row1
+ temp = vload16(0, offset(&src, -1, 0));
+ left = convert_short8(temp.s01234567);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-2);
+ gx += right * (short8)(+2);
+#endif
+
+ // Row2
+ temp = vload16(0, offset(&src, -1, 1));
+ left = convert_short8(temp.s01234567);
+ middle = convert_short8(temp.s12345678);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-1);
+ gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(+1);
+ gy += middle * (short8)(+2);
+ gy += right * (short8)(+1);
+#endif
+
+ // Store results
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel3x3 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel5x5 filter */
+/***********************************************/
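+
+/* The 5x5 Sobel kernels are separable:
+ *   Gx = [1 4 6 4 1]^T (vertical smoothing)    x  [-1 -2 0 2 1] (horizontal derivative)
+ *   Gy = [-1 -2 0 2 1]^T (vertical derivative) x  [1 4 6 4 1]   (horizontal smoothing)
+ * sobel1x5() below performs the horizontal passes and sobel5x1() the vertical ones. */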
+
+/** Compute a 1D horizontal Sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] left1_coeff_gx Weight of the most left pixel for gx
+ * @param[in] left2_coeff_gx Weight of the left pixel for gx
+ * @param[in] middle_coeff_gx Weight of the middle pixel for gx
+ * @param[in] right1_coeff_gx Weight of the right pixel for gx
+ * @param[in] right2_coeff_gx Weight of the most right pixel for gx
+ * @param[in] left1_coeff_gy Weight of the most left pixel for gy
+ * @param[in] left2_coeff_gy Weight of the left pixel for gy
+ * @param[in] middle_coeff_gy Weight of the middle pixel for gy
+ * @param[in] right1_coeff_gy Weight of the right pixel for gy
+ * @param[in] right2_coeff_gy Weight of the most right pixel for gy
+ *
+ * @return a short16 containing short8 gx and short8 gy values.
+ */
+short16 sobel1x5(
+ Image *src,
+ const short left1_coeff_gx,
+ const short left2_coeff_gx,
+ const short middle_coeff_gx,
+ const short right1_coeff_gx,
+ const short right2_coeff_gx,
+ const short left1_coeff_gy,
+ const short left2_coeff_gy,
+ const short middle_coeff_gy,
+ const short right1_coeff_gy,
+ const short right2_coeff_gy)
+{
+ uchar16 temp = vload16(0, offset(src, -2, 0));
+ short8 gx = 0;
+ short8 gy = 0;
+ short8 val;
+
+ val = convert_short8(temp.s01234567);
+ gx += val * (short8)left1_coeff_gx;
+ gy += val * (short8)left1_coeff_gy;
+
+ val = convert_short8(temp.s12345678);
+ gx += val * (short8)left2_coeff_gx;
+ gy += val * (short8)left2_coeff_gy;
+
+ val = convert_short8(temp.s23456789);
+ gx += val * (short8)middle_coeff_gx;
+ gy += val * (short8)middle_coeff_gy;
+
+ val = convert_short8(temp.s3456789a);
+ gx += val * (short8)right1_coeff_gx;
+ gy += val * (short8)right1_coeff_gy;
+
+ val = convert_short8(temp.s456789ab);
+ gx += val * (short8)right2_coeff_gx;
+ gy += val * (short8)right2_coeff_gy;
+
+ return (short16)(gx, gy);
+}
+
+/** Compute a 1D vertical Sobel filter 5x1 for 8 pixels assuming the input is a single channel S16 intermediate image.
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+short8 sobel5x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff)
+{
+ short8 val;
+ short8 out = (short8)0;
+
+ val = vload8(0, (__global short *)offset(src, 0, -2));
+ out += val * (short8)up1_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, -1));
+ out += val * (short8)up2_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 0));
+ out += val * (short8)middle_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 1));
+ out += val * (short8)down1_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 2));
+ out += val * (short8)down2_coeff;
+
+ return (short8)(out);
+}
+
+/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                               Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                            Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x5(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+ short16 gx_gy = sobel1x5(&src,
+ -1, -2, 0, 2, 1,
+ 1, 4, 6, 4, 1);
+
+ // Store result in dst
+#ifdef GRAD_X
+ vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
+ * and output two single channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_x_ptr                             Pointer to the source image. Supported data types: S16
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                 Dummy parameter to ease conditional inclusion
+ */
+__kernel void sobel_separable5x1(
+#ifdef GRAD_X
+ IMAGE_DECLARATION(src_x),
+ IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+ IMAGE_DECLARATION(src_y),
+ IMAGE_DECLARATION(dst_gy),
+#endif
+ int dummy)
+{
+#ifdef GRAD_X
+ Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+#ifdef GRAD_X
+ short8 gx = sobel5x1(&src_x,
+ 1, 4, 6, 4, 1);
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ short8 gy = sobel5x1(&src_y,
+ -1, -2, 0, 2, 1);
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel5x5 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel7x7 filter */
+/***********************************************/
+
+/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
+#define X0 -1
+#define X1 -4
+#define X2 -5
+#define X3 0
+#define X4 5
+#define X5 4
+#define X6 1
+
+/* Sobel 7x1 vertical X / 1x7 horizontal Y coefficients */
+#define Y0 1
+#define Y1 6
+#define Y2 15
+#define Y3 20
+#define Y4 15
+#define Y5 6
+#define Y6 1
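+
+/* The 7x7 Sobel kernels are separable into the derivative coefficients X0..X6 and the smoothing
+ * coefficients Y0..Y6: Gx = [Y0..Y6]^T x [X0..X6] and Gy = [X0..X6]^T x [Y0..Y6]. */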
+
+/* Calculates single horizontal iteration. */
+#define SOBEL1x1_HOR(src, gx, gy, idx) \
+ { \
+ int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
+ gx += val * X##idx; \
+ gy += val * Y##idx; \
+ }
+
+/* Calculates single vertical iteration. */
+#define SOBEL1x1_VERT(src, g, direction, idx) \
+ { \
+ int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
+ g += val * (int8)direction##idx; \
+ }
+
+/* Calculates the full 1x7 horizontal convolution. */
+#define SOBEL1x7(ptr, gx, gy) \
+ SOBEL1x1_HOR(ptr, gx, gy, 0) \
+ SOBEL1x1_HOR(ptr, gx, gy, 1) \
+ SOBEL1x1_HOR(ptr, gx, gy, 2) \
+ SOBEL1x1_HOR(ptr, gx, gy, 3) \
+ SOBEL1x1_HOR(ptr, gx, gy, 4) \
+ SOBEL1x1_HOR(ptr, gx, gy, 5) \
+ SOBEL1x1_HOR(ptr, gx, gy, 6)
+
+/* Calculates the full 7x1 vertical convolution. */
+#define SOBEL7x1(ptr, g, direction) \
+ SOBEL1x1_VERT(ptr, g, direction, 0) \
+ SOBEL1x1_VERT(ptr, g, direction, 1) \
+ SOBEL1x1_VERT(ptr, g, direction, 2) \
+ SOBEL1x1_VERT(ptr, g, direction, 3) \
+ SOBEL1x1_VERT(ptr, g, direction, 4) \
+ SOBEL1x1_VERT(ptr, g, direction, 5) \
+ SOBEL1x1_VERT(ptr, g, direction, 6)
+
+/** Apply a 1x7 Sobel matrix to a single channel U8 input image, output two temporary single channel S32 images and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x7(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+ int8 gx = (int8)0;
+ int8 gy = (int8)0;
+
+ SOBEL1x7(&src, gx, gy);
+
+ // Store result in dst
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global int *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global int *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images, output two single channel S32 images and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                              Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr                              Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                   Dummy parameter to ease conditional inclusion
+ */
+__kernel void sobel_separable7x1(
+#ifdef GRAD_X
+ IMAGE_DECLARATION(src_x),
+ IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+ IMAGE_DECLARATION(src_y),
+ IMAGE_DECLARATION(dst_gy),
+#endif
+ int dummy)
+{
+#ifdef GRAD_X
+ Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ int8 gx = 0;
+ SOBEL7x1(&src_x, gx, Y);
+ vstore8(gx, 0, (__global int *)dst_gx.ptr);
+#endif
+#ifdef GRAD_Y
+ int8 gy = 0;
+ SOBEL7x1(&src_y, gy, X);
+ vstore8(gy, 0, (__global int *)dst_gy.ptr);
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel7x7 filter */
+/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
new file mode 100644
index 0000000000..632b4a5374
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined USE_F16
+#define MINVAL HALF_MIN
+#define SELECT_DATA_TYPE short
+#define DATA_TYPE half
+#else
+#define MINVAL FLT_MIN
+#define SELECT_DATA_TYPE int
+#define DATA_TYPE float
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note In case F16 is used, -DUSE_F16 must be passed, otherwise the kernel will default to F32.
+ * @note In case the input width is not a multiple of 16, -DNON_MULTIPLE_OF_16 must be passed.
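+ *
+ * For example, for an F32 input whose row width is not a multiple of 16, the host might build this kernel with (illustrative options only):\n
+ * const char build_options [] = "-DDATA_TYPE=float -DNON_MULTIPLE_OF_16"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);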
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_max(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Initialize local maximum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;
+
+ // Calculate max of row
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ max_val = max(data, max_val);
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ max_val = max(max_val, select(type_min, data, widx));
+#endif
+
+ // Perform max reduction
+ max_val.s01234567 = max(max_val.s01234567, max_val.s89ABCDEF);
+ max_val.s0123 = max(max_val.s0123, max_val.s4567);
+ max_val.s01 = max(max_val.s01, max_val.s23);
+ max_val.s0 = max(max_val.s0, max_val.s1);
+
+ // Store result
+ *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
+}
+
+/** Shifts the values of the input tensor by the max value calculated in the softmax_layer_max kernel,
+ * then computes the exponential of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note In case F16 is used, -DUSE_F16 must be passed, otherwise the kernel will default to F32.
+ * @note In case the input width is not a multiple of 16, -DNON_MULTIPLE_OF_16 must be passed.
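+ * @note Subtracting the row maximum before taking the exponential keeps every exponent non-positive, which avoids overflow without changing the final softmax result.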
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: F16, F32
+ * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
+ * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_shift_exp_sum(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(max),
+ IMAGE_DECLARATION(dst),
+ IMAGE_DECLARATION(sum),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TO_IMAGE_STRUCT(max);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+
+ // Load max value of 1D logits vector (row)
+ DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
+
+ // Set sum vector
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sum1D = 0;
+
+ // Shift values, exp and sum
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ data = exp(data - max_val);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
+ sum1D += data;
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ data = exp(data - max_val);
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ data = select(0, data, widx);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
+ sum1D += data;
+#endif
+
+ // Perform sum reduction
+ sum1D.s01234567 = sum1D.s01234567 + sum1D.s89ABCDEF;
+ sum1D.s0123 = sum1D.s0123 + sum1D.s4567;
+ sum1D.s01 = sum1D.s01 + sum1D.s23;
+ sum1D.s0 = sum1D.s0 + sum1D.s1;
+
+ // Calculate and store result
+ *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Divides all the values of the input tensor by the sum calculated in the softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+ // Load the sum value of the 1D logits vector (row)
+ DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
+ vstore16(data / sum_val, 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+}
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
new file mode 100644
index 0000000000..cee116bd75
--- /dev/null
+++ b/src/core/CL/cl_kernels/tablelookup.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs table lookup on U8 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: U8
+ */
+__kernel void tablelookup_U8(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global uchar *lut)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load input data */
+ uchar8 data = vload8(0, src.ptr);
+
+ /* Load lut data */
+ uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3],
+ lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]);
+
+ /* Store result */
+ vstore8(lut_data, 0, dst.ptr);
+}
+
+/** This function performs table lookup on S16 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: S16
+ * @param[in] offset LUT offset
+ * @param[in] count Number of elements in the LUT
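+ *
+ * @note The lookup index is computed as input value + offset; indices outside the range [0, count) leave the corresponding output pixel unchanged.
+ * For example (illustrative values), with offset = 32768 an input value of -32768 reads LUT entry 0.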
+ */
+__kernel void tablelookup_S16(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global short *lut,
+ uint offset,
+ uint count)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load input data */
+ short8 data = vload8(0, (__global short *)src.ptr);
+
+ /* Load output data */
+ int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
+
+ /* Calculate index */
+ int8 index = convert_int8(data) + (int8)(offset);
+ int8 cond = (index >= 0 && index < (int8)count);
+ index = select(0, index, cond);
+
+ /* Load lut data */
+ int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
+ lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
+
+ /* Select output data depending on condition */
+ lut_data = select(out_data, lut_data, cond);
+
+ /* Store result */
+ vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
new file mode 100644
index 0000000000..2b1e6ff35d
--- /dev/null
+++ b/src/core/CL/cl_kernels/threshold.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform binary thresholding on an image.
+ *
+ * @param[in] in_ptr Pointer to the source image
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val False value
+ * @param[in] true_val True value
+ * @param[in] threshold The threshold value
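+ *
+ * @note Input pixels strictly greater than the threshold produce true_val in the output, all other pixels produce false_val.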
+ */
+__kernel void threshold_binary(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar threshold)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ uchar16 in_data = vload16(0, in.ptr);
+
+ // Perform binary thresholding
+ in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold);
+
+ // Store result
+ vstore16(in_data, 0, out.ptr);
+}
+
+/** Perform range thresholding on an image.
+ *
+ * @param[in] in_ptr Pointer to the source image
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val False value
+ * @param[in] true_val True value
+ * @param[in] lower Lower threshold
+ * @param[in] upper Upper threshold
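+ *
+ * @note Input pixels inside the range [lower, upper] produce true_val in the output, all other pixels produce false_val.
+ * For example (illustrative values), with lower = 100 and upper = 200 an input pixel of 150 yields true_val and one of 220 yields false_val.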
+ */
+__kernel void threshold_range(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar lower,
+ const uchar upper)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ uchar16 in_data = vload16(0, in.ptr);
+
+ // Perform range thresholding
+ in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower);
+
+ // Store result
+ vstore16(in_data, 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
new file mode 100644
index 0000000000..c30158f280
--- /dev/null
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SWAP_ROW(u0, l0) \
+ ({ \
+ tmp_swap = u0; \
+ u0 = l0; \
+ l0 = tmp_swap; \
+ })
+
+#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ })
+
+#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ SWAP_ROW(u4, l4); \
+ SWAP_ROW(u5, l5); \
+ SWAP_ROW(u6, l6); \
+ SWAP_ROW(u7, l7); \
+ })
+
+#define TRANSPOSE_4x4(u0, u1, u2, u3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp; \
+ tmp.s012 = u0.s123; \
+ u0.s1 = u1.s0; \
+ u0.s2 = u2.s0; \
+ u0.s3 = u3.s0; \
+ u1.s0 = tmp.s0; \
+ u2.s0 = tmp.s1; \
+ u3.s0 = tmp.s2; \
+ \
+ tmp.s01 = u1.s23; \
+ u1.s2 = u2.s1; \
+ u1.s3 = u3.s1; \
+ u2.s1 = tmp.s0; \
+ u3.s1 = tmp.s1; \
+ \
+ tmp.s0 = u2.s3; \
+ u2.s3 = u3.s2; \
+ u3.s2 = tmp.s0; \
+ })
+
+#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \
+ ({ \
+ TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \
+ TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \
+ TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \
+ SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ })
+
+#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \
+ ({ \
+ TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \
+ TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \
+ TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \
+ SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \
+ u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ })
+
+#ifndef DATA_TYPE_IN_BYTES
+#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
+#endif
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3)
+#define VLOAD(x, y) vload4(x, y)
+#define VSTORE(x, y, z) vstore4(x, y, z)
+#define BLOCK_SIZE 4
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)
+#define VLOAD(x, y) vload8(x, y)
+#define VSTORE(x, y, z) vstore8(x, y, z)
+#define BLOCK_SIZE 8
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)
+#define VLOAD(x, y) vload16(x, y)
+#define VSTORE(x, y, z) vstore16(x, y, z)
+#define BLOCK_SIZE 16
+#else
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif
+
+/** This OpenCL kernel computes the transposition of the input matrix
+ *
+ * @attention The number of bytes of the data type needs to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
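+ *
+ * For example, to transpose an FP16 matrix the host might build this kernel with (illustrative options only):\n
+ * const char build_options [] = "-DDATA_TYPE_IN_BYTES=2"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);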
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0) * BLOCK_SIZE;
+ uint y = get_global_id(1) * BLOCK_SIZE;
+
+ // Compute source address
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Load the NxN block at (x, y)
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+#if BLOCK_SIZE > 4
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7)));
+#if BLOCK_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15)));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+ // Transpose the block
+ TRANSPOSE();
+
+ // Store the block at (y, x)
+ uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes;
+ VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y));
+ VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y));
+ VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y));
+ VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y));
+#if BLOCK_SIZE > 4
+ VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y));
+ VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y));
+ VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y));
+ VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y));
+#if BLOCK_SIZE == 16
+ VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y));
+ VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y));
+ VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y));
+ VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y));
+ VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y));
+ VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y));
+ VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y));
+ VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
new file mode 100644
index 0000000000..87736465d2
--- /dev/null
+++ b/src/core/CL/cl_kernels/types.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/** Keypoint struct */
+typedef struct Keypoint
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+ float strength; /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+ float scale; /**< Initialized to 0 by corner detectors. */
+ float orientation; /**< Initialized to 0 by corner detectors. */
+ int tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+ float error; /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+ ushort x; /**< Top-left x coordinate */
+ ushort y; /**< Top-left y coordinate */
+ ushort width; /**< Width of the detection window */
+ ushort height; /**< Height of the detection window */
+ ushort idx_class; /**< Index of the class */
+ float score; /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
new file mode 100644
index 0000000000..0a4748f452
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns a vector of floats containing the affine matrix coefficients. */
+inline const float8 build_affine_mtx()
+{
+ return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx affine matrix
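+ *
+ * The coefficients are laid out as M[1][1] = mtx.s0, M[2][1] = mtx.s1, M[1][2] = mtx.s2, M[2][2] = mtx.s3,
+ * M[1][3] = mtx.s4 and M[2][3] = mtx.s5, i.e. the order produced by build_affine_mtx().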
+ *
+ * @return a float8 containing four 2D transformed values.
+ */
+inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs an affine transform on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * output(x,y) = input(x0,y0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
new file mode 100644
index 0000000000..26a8b859a4
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+ const float4 clamped_x = clamp(coords.even, -1.0f, width);
+ const float4 clamped_y = clamp(coords.odd, -1.0f, height);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/** Reads four texels from the input image. The coords vector determines which texels are read.
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+ return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+ *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+ *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+ *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+ return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel's coordinates, this function returns the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return A vector of 8 floats with the coordinates; even positions are x coordinates and odd positions are y coordinates.
+*/
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even positions are x coordinates and odd positions are y coordinates.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
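+ *
+ * For a sampling position (x, y) with dx = x - floor(x) and dy = y - floor(y), the interpolated value is
+ * tl * (1 - dx) * (1 - dy) + tr * dx * (1 - dy) + bl * (1 - dx) * dy + br * dx * dy, where tl, tr, bl and br
+ * are the top-left, top-right, bottom-left and bottom-right neighbouring texels (clamped to the image borders).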
+*/
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+ // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+ // Compute the clamped coordinates of the four neighbours for each of the four sampling points
+ const float8 fc = floor(coords);
+ const float16 c1 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height));
+ const float16 c2 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height));
+ // Loads the values from the input image
+ const float16 t = (float16)(
+ /* tl, tr, bl, br */
+ * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr = (float4)(
+ ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
new file mode 100644
index 0000000000..863b6c9e96
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns the perspective matrix */
+inline const float16 build_perspective_mtx()
+{
+ return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * The transformed coordinate is (x0/z0, y0/z0).
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx perspective matrix
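+ *
+ * The coefficients are laid out as M[1][1] = mtx.s0, M[2][1] = mtx.s1, M[3][1] = mtx.s2, M[1][2] = mtx.s3,
+ * M[2][2] = mtx.s4, M[3][2] = mtx.s5, M[1][3] = mtx.s6, M[2][3] = mtx.s7 and M[3][3] = mtx.s8,
+ * i.e. the order produced by build_perspective_mtx().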
+ *
+ * @return a float8 vector containing four 2D transformed values.
+ */
+inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [z,z+1,z+2,z+3]
+ const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
+ // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs perspective transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * output(x,y) = input(x0/z0,y0/z0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..685b8e234e
--- /dev/null
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "The output image can only be U8 if both input images are U8");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
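
configure() above follows the pattern used by most of the element-wise kernels in this patch: a 16-element processing step, horizontal access windows on all three tensors, and an output valid region computed as the intersection of the two input valid regions. The element-wise operation behind the "absdiff" program is simply an absolute difference converted to the output type; a simplified scalar reference is sketched below, assuming a saturating conversion for the S16 output:

    // Scalar reference of the per-element operation; the CL kernel vectorises
    // this over 16 elements per work item.
    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>

    static uint8_t absdiff_u8(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(a > b ? a - b : b - a);
    }

    static int16_t absdiff_s16(int16_t a, int16_t b)
    {
        const int32_t d = std::abs(static_cast<int32_t>(a) - static_cast<int32_t>(b));
        return static_cast<int16_t>(std::min<int32_t>(d, 32767)); // saturate to the S16 maximum
    }
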
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
new file mode 100644
index 0000000000..6333f04e71
--- /dev/null
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate"));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted"));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, alpha);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_squared"));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, shift);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
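
The three configure() methods above differ only in the program name and in the static argument appended after the two 2D tensor arguments (alpha for the weighted variant, shift for the squared variant). What the kernels accumulate per pixel, assuming the conventional OpenVX-style definitions these vision kernels follow, is sketched below:

    // Per-pixel reference of the three accumulation modes (assumption: standard
    // OpenVX-style semantics; the CL kernels vectorise this over 16 pixels).
    #include <algorithm>
    #include <cstdint>

    // accumulate: running sum into an S16 accumulator, saturated
    static int16_t accumulate_ref(uint8_t input, int16_t accum)
    {
        const int32_t sum = static_cast<int32_t>(accum) + input;
        return static_cast<int16_t>(std::min<int32_t>(sum, 32767));
    }

    // accumulate_weighted: exponential moving average with 0 <= alpha <= 1
    static uint8_t accumulate_weighted_ref(uint8_t input, uint8_t accum, float alpha)
    {
        return static_cast<uint8_t>((1.0f - alpha) * accum + alpha * input);
    }

    // accumulate_squared: add the squared input, right-shifted by 0 <= shift <= 15
    static int16_t accumulate_squared_ref(uint8_t input, int16_t accum, uint32_t shift)
    {
        const int32_t sum = static_cast<int32_t>(accum) + ((input * input) >> shift);
        return static_cast<int16_t>(std::min<int32_t>(sum, 32767));
    }
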
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
new file mode 100644
index 0000000000..83bbe6a3be
--- /dev/null
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.insert(("-D" + string_from_activation_func(act_info.activation())));
+ build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? std::string("TYPE_FP") : std::string("TYPE_INT"))));
+ build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.insert(("-DA=" + val_to_string(act_info.a())));
+ build_opts.insert(("-DB=" + val_to_string(act_info.b())));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
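
The activation function, the data type and the two parameters a and b are all baked in at program build time through the -D options above, so the resulting binary contains exactly one activation. For reference, a few of the parameterised activations selectable through string_from_activation_func() behave as sketched below (function names here are illustrative, not the exact CL defines):

    // Illustrative reference of common activation functions; a and b are the
    // ActivationLayerInfo parameters passed via -DA= and -DB=.
    #include <algorithm>
    #include <cmath>

    static float logistic_act(float x)               { return 1.0f / (1.0f + std::exp(-x)); }
    static float relu_act(float x)                   { return std::max(0.0f, x); }
    static float bounded_relu_act(float x, float a)  { return std::min(a, std::max(0.0f, x)); }
    static float tanh_act(float x, float a, float b) { return a * std::tanh(b * x); }
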
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..aaa62d0268
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Check for invalid combination
+ if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8))
+ {
+ ARM_COMPUTE_ERROR("You called with the wrong data types.");
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
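
The convert policy is the only behavioural switch: -DWRAP lets an overflowing result wrap around in the output type, -DSATURATE clamps it, and a floating-point output always builds with -DWRAP because saturation is not meaningful there. A scalar sketch of the two policies for an S16 output:

    // Scalar reference of WRAP vs SATURATE for an S16 output (illustrative; the
    // CL kernel selects plain or saturating conversion via -DWRAP/-DSATURATE).
    #include <algorithm>
    #include <cstdint>

    static int16_t add_wrap_s16(int16_t a, int16_t b)
    {
        // Keep only the low 16 bits of the wider sum
        const uint32_t sum = static_cast<uint32_t>(a) + static_cast<uint32_t>(b);
        return static_cast<int16_t>(static_cast<uint16_t>(sum));
    }

    static int16_t add_saturate_s16(int16_t a, int16_t b)
    {
        const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
        return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(sum, -32768), 32767));
    }
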
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..4c847276da
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ // Check for invalid combination
+ if(output->info()->data_type() == DataType::U8)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ }
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..309a153b7a
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+
+#include <set>
+#include <string>
+
+
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+{
+}
+
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _beta = beta;
+ _gamma = gamma;
+ _epsilon = epsilon;
+
+ // Create kernel
+ std::string kernel_name = "batchnormalization_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set kernel static arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input/output tensors and the four 1D parameter tensors bound in run()
+ _kernel.setArg<cl_float>(idx++, _epsilon);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ add_1D_tensor_argument(idx, _mean, vector_slice);
+ add_1D_tensor_argument(idx, _var, vector_slice);
+ add_1D_tensor_argument(idx, _beta, vector_slice);
+ add_1D_tensor_argument(idx, _gamma, vector_slice);
+
+ do
+ {
+ idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
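
mean, var, beta and gamma are 1D tensors indexed by feature map (the validation above ties their length to dimension 2 of the input), which is why run() binds them once through a degenerate 1D slice and only re-binds the 3D input/output per slice. Per element, the kernel applies the standard batch-normalisation formula, sketched here for F32:

    // Per-element reference of batch normalisation for an F32 value x belonging
    // to feature map c; epsilon is the static kernel argument set in configure().
    #include <cmath>

    static float batch_normalization_ref(float x, float mean_c, float var_c,
                                         float beta_c, float gamma_c, float epsilon)
    {
        const float x_norm = (x - mean_c) / std::sqrt(var_c + epsilon);
        return gamma_c * x_norm + beta_c;
    }
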
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..5ea4a86da5
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseAndKernel::CLBitwiseAndKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..0098e15ab6
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..2eeef0a993
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseOrKernel::CLBitwiseOrKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c19a78e1c4
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseXorKernel::CLBitwiseXorKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
new file mode 100644
index 0000000000..e113d30210
--- /dev/null
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLBox3x3Kernel::border_size() const
+{
+ return 1;
+}
+
+void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Set build options
+ std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
+ "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
+ "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
+ "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
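
There is no dedicated box-filter program: the generic convolution3x3_static kernel is built with all nine coefficients set to 1 and SCALE=9, so each output pixel is the mean of its 3x3 neighbourhood, which is also why border_size() is 1. A scalar reference of that filter:

    // Reference of the 3x3 box filter this kernel is built as: a static 3x3
    // convolution with MAT0..MAT8 == 1 and SCALE == 9 (the neighbourhood mean).
    // Border handling is omitted; the kernel relies on a 1-pixel border.
    #include <cstdint>

    static uint8_t box3x3_ref(const uint8_t *src, int stride, int x, int y)
    {
        int sum = 0;
        for(int dy = -1; dy <= 1; ++dy)
        {
            for(int dx = -1; dx <= 1; ++dx)
            {
                sum += src[(y + dy) * stride + (x + dx)];
            }
        }
        return static_cast<uint8_t>(sum / 9);
    }
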
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
new file mode 100644
index 0000000000..5d06d34631
--- /dev/null
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGradientKernel::CLGradientKernel()
+ : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
+ "Gx and Gy must have the same pixel size");
+ ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
+ "Mag must have the same pixel size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+ // Create kernel
+ const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, built_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _gx, slice);
+ add_2D_tensor_argument(idx, _gy, slice);
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ add_2D_tensor_argument(idx, _phase, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
+ : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
+
+ _magnitude = magnitude;
+ _phase = phase;
+ _output = output;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", built_opts));
+
+ // Set minimum threshold argument
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); // Skip the magnitude, phase and output parameters
+ _kernel.setArg(idx++, lower_thr);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_written_per_iteration = 3;
+
+ Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
+ num_elems_read_written_per_iteration, num_elems_read_written_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, mag_access, phase_access, output_access);
+
+ output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ add_2D_tensor_argument(idx, _phase, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLEdgeTraceKernel::CLEdgeTraceKernel()
+ : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
+{
+}
+
+void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+ ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+ _lower_thr = lower_thr;
+ _upper_thr = upper_thr;
+ _visited = visited;
+ _recorded = recorded;
+ _l1_stack = l1_stack;
+ _l1_stack_counter = l1_stack_counter;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", built_opts));
+
+ // Set constant kernel args
+ unsigned int width = _input->info()->dimension(0);
+ unsigned int height = _input->info()->dimension(1);
+ unsigned int idx = 6 * num_arguments_per_2D_tensor(); // Skip the input, output and the four auxiliary 2D tensors
+ _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr));
+ _kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr));
+ _kernel.setArg(idx++, static_cast<cl_uint>(width));
+ _kernel.setArg(idx++, static_cast<cl_uint>(height));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration),
+ output_access,
+ visited_access,
+ recorded_access,
+ l1_stack_access,
+ l1_stack_counter_access);
+
+ output_access.set_valid_region(win, _input->info()->valid_region());
+ visited_access.set_valid_region(win, _input->info()->valid_region());
+ recorded_access.set_valid_region(win, _input->info()->valid_region());
+ l1_stack_access.set_valid_region(win, _input->info()->valid_region());
+ l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _visited, slice);
+ add_2D_tensor_argument(idx, _recorded, slice);
+ add_2D_tensor_argument(idx, _l1_stack, slice);
+ add_2D_tensor_argument(idx, _l1_stack_counter, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
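
The three kernels above map onto the classic Canny stages: CLGradientKernel combines Gx/Gy into a magnitude (L1 or L2 norm, selected by norm_type) and a phase, CLEdgeNonMaxSuppressionKernel thins the magnitude image against lower_thr, and CLEdgeTraceKernel performs hysteresis between lower_thr and upper_thr using the visited/recorded/l1_stack auxiliary tensors. The gradient-combination stage is the only purely element-wise one; a simplified reference of its two norms for S16 gradients:

    // Reference of combine_gradients_L1 / combine_gradients_L2 for S16 gradients
    // (the phase computation and its quantisation are omitted for brevity).
    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    static uint16_t combine_gradients_l1(int16_t gx, int16_t gy)
    {
        return static_cast<uint16_t>(std::abs(static_cast<int>(gx)) + std::abs(static_cast<int>(gy)));
    }

    static uint16_t combine_gradients_l2(int16_t gx, int16_t gy)
    {
        const float g = std::sqrt(static_cast<float>(gx) * gx + static_cast<float>(gy) * gy);
        return static_cast<uint16_t>(g);
    }
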
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
new file mode 100644
index 0000000000..d729ebcfb3
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLChannelCombineKernel::CLChannelCombineKernel()
+ : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
+{
+}
+
+void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+
+ const Format fmt = output->info()->format();
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ if(Format::RGBA8888 == fmt)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+ _planes[3] = plane3;
+ }
+ else
+ {
+ _planes[3] = nullptr;
+ }
+ _output = output;
+ _output_multi = nullptr;
+
+ // Half the processed elements for U,V channels due to sub-sampling of 2
+ if(Format::YUYV422 == fmt || Format::UYVY422 == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ }
+ else
+ {
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ }
+
+ // Create kernel
+ std::string kernel_name = "channel_combine_" + string_from_format(fmt);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
+ plane1->info()->valid_region(),
+ plane2->info()->valid_region());
+ if(plane3 != nullptr)
+ {
+ valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
+ }
+ output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+ _output = nullptr;
+ _output_multi = output;
+ bool has_two_planars = false;
+
+ // Set sub-sampling parameters for each plane
+ const Format fmt = output->info()->format();
+ std::string kernel_name;
+ std::set<std::string> build_opts;
+
+ if(Format::NV12 == fmt || Format::NV21 == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ kernel_name = "channel_combine_NV";
+ build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21");
+ has_two_planars = true;
+ }
+ else
+ {
+ if(Format::IYUV == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ }
+ else
+ {
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ }
+
+ kernel_name = "copy_planes_3p";
+ build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+ update_window_and_padding(win,
+ input_plane0_access, input_plane1_access, input_plane2_access,
+ output_plane0_access, output_plane1_access, output_plane2_access);
+
+ ValidRegion plane0_valid_region = plane0->info()->valid_region();
+ ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
+ output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ // Compute the sub-sampled window for plane 1
+ Window win_sub_plane1(slice);
+ win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1]));
+ win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1));
+
+ // Compute the sub-sampled window for plane 2
+ Window win_sub_plane2(slice);
+ win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2]));
+ win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1));
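+ // For example, with a step of 16 along X and a sub-sampling factor of 2, planes 1 and 2 are
+ // traversed over half the range with a step of 8, so each work item stays aligned with its group
+ // of sub-sampled U/V samples.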
+
+ unsigned int idx = 0;
+
+ // Set inputs
+ add_2D_tensor_argument(idx, _planes[0], slice);
+ add_2D_tensor_argument(idx, _planes[1], win_sub_plane1);
+ add_2D_tensor_argument(idx, _planes[2], win_sub_plane2);
+
+ if(nullptr != _planes[3])
+ {
+ add_2D_tensor_argument(idx, _planes[3], slice);
+ }
+
+ // Set outputs
+ if(nullptr != _output) // Single planar output
+ {
+ add_2D_tensor_argument(idx, _output, slice);
+ }
+ else // Multi-planar output
+ {
+ // Reduce the slice in Y in case of sub-sampling to avoid out-of-bounds accesses
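+ // For the 4:2:0 outputs the kernel writes two luma rows per enqueued row, and slice.y().end() is
+ // also passed to the kernel below, presumably as an upper bound to clamp the last row write.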
+ slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
+
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice);
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1);
+
+ if(3 == num_planes_from_format(_output_multi->info()->format()))
+ {
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(2), win_sub_plane2);
+ }
+
+ _kernel.setArg(idx++, slice.y().end());
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
new file mode 100644
index 0000000000..541153316a
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLChannelExtractKernel::CLChannelExtractKernel()
+ : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
+{
+}
+
+void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+ _input = input;
+ _output = output;
+
+ // Check format
+ const Format format = input->info()->format();
+ ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+ // Create kernel
+ std::string kernel_name = "channel_extract_" + string_from_format(format);
+ std::set<std::string> build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Halve the number of processed elements for the U and V channels due to sub-sampling by 2
+ _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
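+ // For example, extracting U or V from a YUYV422/UYVY422 image yields an output of half the input
+ // width, hence the sub-sampling factor of 2 applied to the output access window below.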
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ ValidRegion input_valid_region = input->info()->valid_region();
+ output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+ // Get format
+ const Format fmt = input->info()->format();
+
+ // Get input plane
+ const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel));
+ ARM_COMPUTE_ERROR_ON(nullptr == input_plane);
+
+ _output = output;
+ _input = input_plane;
+ _subsampling = 1;
+
+ // Create kernel
+ std::string kernel_name;
+ std::set<std::string> build_opts;
+ if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt)
+ {
+ kernel_name = "copy_plane";
+ }
+ else
+ {
+ kernel_name = "channel_extract_" + string_from_format(fmt);
+ build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input_plane->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ Window win_sub(slice);
+ win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling));
+ win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, win_sub);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
new file mode 100644
index 0000000000..ad66c39483
--- /dev/null
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+CLCol2ImKernel::CLCol2ImKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, _convolved_dims.first);
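+ // The convolved output width passed above lets the kernel decompose the linear spatial index of
+ // each element back into (x, y) coordinates in the destination feature map.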
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_2D();
+ Window slice_out = window.first_slice_window_3D();
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ }
+ while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
new file mode 100644
index 0000000000..ead2b8f092
--- /dev/null
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <sstream>
+
+using namespace arm_compute;
+
+CLColorConvertKernel::CLColorConvertKernel()
+ : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
+{
+}
+
+void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+ switch(input->info()->format())
+ {
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 8;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _multi_input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+ const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
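+ // All inputs supported here (NV12, NV21, IYUV) are 4:2:0, where one chroma row serves two luma rows,
+ // so the window steps two rows at a time and each iteration produces two output rows.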
+ win.set_dimension_step(Window::DimY, 2);
+
+ AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling, sub_sampling);
+ AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling, sub_sampling);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ plane0_access, plane1_access, plane2_access,
+ output_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+ input->plane(2)->info()->valid_region());
+ output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+ float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+ switch(input->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::IYUV:
+ num_elems_processed_per_iteration = 2;
+ break;
+ case Format::YUV444:
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::IYUV:
+ num_elems_processed_per_iteration = 8;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _input = input;
+ _multi_output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
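+ // Except for RGB888/RGBA8888 -> YUV444, every supported output is 4:2:0, so two input rows are
+ // consumed per iteration to produce a single chroma row; hence the Y step of 2 set below.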
+ if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
+ {
+ win.set_dimension_step(Window::DimY, 2);
+ }
+
+ AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+ AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
+ num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_plane0_access,
+ output_plane1_access,
+ output_plane2_access);
+
+ ValidRegion input_region = input->info()->valid_region();
+
+ output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ case Format::YUV444:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::YUV444:
+ case Format::NV12:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _multi_input = input;
+ _multi_output = output;
+
+ // Create kernel
+ bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+ bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+
+ float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
+ float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling_input, sub_sampling_input);
+ AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling_input, sub_sampling_input);
+ AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+ AccessWindowRectangle output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
+ num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+
+ update_window_and_padding(win,
+ input_plane0_access, input_plane1_access, input_plane2_access,
+ output_plane0_access, output_plane1_access, output_plane2_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+ input->plane(2)->info()->valid_region());
+ output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ if(nullptr != _input && nullptr != _output)
+ {
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _input && nullptr != _multi_output)
+ {
+ Format format = _multi_output->info()->format();
+ do
+ {
+ Window win_uv(slice);
+
+ if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
+ {
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ }
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _multi_input && nullptr != _output)
+ {
+ Format format = _multi_input->info()->format();
+ do
+ {
+ Window win_uv(slice);
+
+ if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
+ {
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
+
+ for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv);
+ }
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _multi_input && nullptr != _multi_output)
+ {
+ Format in_format = _multi_input->info()->format();
+ Format out_format = _multi_output->info()->format();
+ do
+ {
+ Window win_in_uv(slice);
+ if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format))
+ {
+ win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2,
+ win_in_uv.x().end() / 2, win_in_uv.x().step() / 2));
+ win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1));
+ }
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv);
+ }
+
+ Window win_out_uv(slice);
+ if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format))
+ {
+ win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2,
+ win_out_uv.x().end() / 2, win_out_uv.x().step() / 2));
+ win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1));
+ }
+
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
new file mode 100644
index 0000000000..bdfe398a1d
--- /dev/null
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstring>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+#define MAX_MATRIX_SIZE 81
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+BorderSize CLConvolutionKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2);
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+ _input = input;
+ _output = output;
+
+ std::stringstream kernel_name;
+ std::set<std::string> options;
+ kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
+
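+ // A scale of 0 requests that the scale be derived from the matrix itself, presumably the sum of
+ // its coefficients clamped to a minimum of 1.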
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
+
+ for(unsigned int i = 0; i < matrix_size * matrix_size; i++)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << conv[i];
+ options.insert(mat_str.str());
+ }
+
+ options.insert("-DSCALE=" + val_to_string(scale));
+
+ DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
+ options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ std::stringstream out_type;
+ out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ options.insert(out_type.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = matrix_size;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Separable Convolution *
+\****************************************************************************************/
+template <unsigned int matrix_size>
+CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
+ : _border_size(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <unsigned int matrix_size>
+void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+
+ ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
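+ // The horizontal pass always needs matrix_size / 2 columns of border on the left and right, while
+ // rows of top/bottom border are only claimed when the caller requires the border to remain defined.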
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ int16_t mat[matrix_size * matrix_size] = { 0 };
+ memcpy(mat, conv, matrix_size * sizeof(int16_t));
+
+ for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+ {
+ build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ build_opts.insert("-DSCALE=0");
+
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2, 0);
+}
+
+template <unsigned int matrix_size>
+void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
+ const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+ ARM_COMPUTE_ERROR_ON(scale == 0);
+
+ _input = input;
+ _output = output;
+
+ std::set<std::string> build_opts;
+
+ int16_t mat[matrix_size * matrix_size] = { 0 };
+ memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
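+ // The coefficients are copied at an offset of matrix_size so that they populate the MAT<n> defines
+ // read by the vertical kernel variant (presumably MAT5..MAT9 for the 5x5 case); the leading entries stay zero.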
+
+ for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+ {
+ build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+
+ build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ std::stringstream out_type;
+ out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ build_opts.insert(out_type.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = matrix_size;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
+ : _border_size(0), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLConvolutionRectangleKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == conv);
+ ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
+ ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+ ARM_COMPUTE_ERROR_ON(0 == scale);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(height / 2, width / 2);
+
+ std::set<std::string> options;
+
+ std::stringstream output_type;
+ output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ options.insert(output_type.str());
+
+ uint32_t matrix_size = width * height;
+
+ int16_t mat[MAX_MATRIX_SIZE] = { 0 };
+
+ memcpy(mat, conv, matrix_size * sizeof(int16_t));
+
+ for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
+ {
+ options.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ options.insert("-DSCALE=" + val_to_string(scale));
+
+ DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
+ options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ options.insert("-DMATRIX_WIDTH=" + val_to_string(width));
+ options.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ const unsigned int num_rows_read_per_iteration = height;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+template class arm_compute::CLConvolutionKernel<3>;
+template class arm_compute::CLConvolutionKernel<5>;
+template class arm_compute::CLConvolutionKernel<7>;
+template class arm_compute::CLConvolutionKernel<9>;
+template class arm_compute::CLSeparableConvolutionVertKernel<5>;
+template class arm_compute::CLSeparableConvolutionVertKernel<7>;
+template class arm_compute::CLSeparableConvolutionVertKernel<9>;
+template class arm_compute::CLSeparableConvolutionHorKernel<5>;
+template class arm_compute::CLSeparableConvolutionHorKernel<7>;
+template class arm_compute::CLSeparableConvolutionHorKernel<9>;
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..73f1ba15df
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The differences between the input and output sizes in the two lowest dimensions must be even,
+ // otherwise it is not clear how the padding should be distributed around the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+ // Configure kernel window
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
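+ // The input is centred inside the (possibly larger) output plane: half of the size difference in each
+ // dimension becomes border, and the byte offset computed below points at the first output element that
+ // the input is copied to.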
+
+ const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
+ output->info()->strides_in_bytes()[1];
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+ // The window needs to be based on the input since all of the input's depth slices are copied
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
new file mode 100644
index 0000000000..24608bd17c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
+ ARM_COMPUTE_ERROR_ON(shift >= 8);
+
+ // Check if convertion is supported
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
+ && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+ && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] U16 -> [out] U8, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+ && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] S16 -> [out] U8, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S16),
+ "Only data types supported [in] U32 -> [out] U8, U16, S16");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S16),
+ "Only data types supported [in] S32 -> [out] U8, U16, S16");
+
+ // Get data sizes
+ const size_t input_size = data_size_from_type(input->info()->data_type());
+ const size_t output_size = data_size_from_type(output->info()->data_type());
+
+ // Construct kernel name and build options
+ std::string kernel_name = "convert_depth";
+ std::set<std::string> build_opts;
+ if(input_size > output_size)
+ {
+ kernel_name += "_down";
+ build_opts.insert((policy == ConvertPolicy::WRAP) ? "-DWRAP" : "-DSATURATE");
+ }
+ else
+ {
+ kernel_name += "_up";
+ }
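+ // For example, U8 -> S16 widens and selects the "convert_depth_up" kernel, while S16 -> U8 narrows
+ // and selects "convert_depth_down"; the wrap/saturate policy is only relevant when narrowing.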
+ build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set shift arg
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg(idx++, shift);
+
+ // Configure kernel
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
new file mode 100644
index 0000000000..36ba06d528
--- /dev/null
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLDerivativeKernel::CLDerivativeKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
+{
+}
+
+BorderSize CLDerivativeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_derivative_x = output_x != nullptr;
+ _run_derivative_y = output_y != nullptr;
+
+ if(_run_derivative_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_derivative_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_derivative_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_derivative_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_read_rows_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
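+    // The derivative along X reads one extra pixel on each side and the derivative along Y reads
+    // one extra row above and below (three rows in total), so the input access window is widened
+    // accordingly for the combination of gradients that is actually computed.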
+ if(_run_derivative_x && _run_derivative_y)
+ {
+ input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+ }
+ else if(_run_derivative_x)
+ {
+ input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration);
+ }
+ else if(_run_derivative_y)
+ {
+ input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+ }
+
+ update_window_and_padding(win,
+ input_access,
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_derivative_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_derivative_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
new file mode 100644
index 0000000000..3abd747011
--- /dev/null
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLDilateKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate"));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
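+    // Each work-item writes 8 output pixels computed from a 3-row neighbourhood; 16 input pixels
+    // per row are read so that the 3x3 structuring element has valid left/right neighbours around
+    // the 8 outputs (the extra elements keep the read vector-friendly).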
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
new file mode 100644
index 0000000000..a7aa88fc5c
--- /dev/null
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLErodeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode"));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
new file mode 100644
index 0000000000..1d4d776730
--- /dev/null
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLFastCornersKernel::CLFastCornersKernel()
+ : ICLKernel(), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLFastCornersKernel::border_size() const
+{
+ return BorderSize(3);
+}
+
+void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> build_opts;
+
+ if(non_max_suppression)
+ {
+ build_opts.emplace("-DUSE_MAXSUPPRESSION");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_float>(idx, static_cast<float>(threshold));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 7;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLCopyToArrayKernel::CLCopyToArrayKernel()
+ : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
+{
+}
+
+void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(corners == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
+
+ _input = input;
+ _corners = corners;
+ _num_buffer = num_buffers;
+
+ std::set<std::string> build_opts;
+
+ if(update_number)
+ {
+ build_opts.emplace("-DUPDATE_NUMBER");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts));
+
+    // Get how many pixels were skipped in the x dimension by the previous stages
+ unsigned int offset = _input->info()->valid_region().anchor.x();
+
+ // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input tensor parameters
+ _kernel.setArg<unsigned int>(idx++, corners->max_num_values());
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg(idx++, *_num_buffer);
+ _kernel.setArg(idx++, _corners->cl_buffer());
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Initialise _num_buffer as it is used as both input and output
+ static const unsigned int zero_init = 0;
+ queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
new file mode 100644
index 0000000000..981aad665a
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLFillBorderKernel::CLFillBorderKernel()
+ : ICLKernel(), _tensor(nullptr)
+{
+}
+
+bool CLFillBorderKernel::is_parallelisable() const
+{
+ return false;
+}
+
+template <class T>
+void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+ T value;
+ constant_border_value.get(value);
+ ICLKernel::add_argument<T>(idx, static_cast<T>(value));
+}
+
+void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+ ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+ border_size.limit(tensor->info()->padding());
+
+ // If there is no border: early exit
+ if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ {
+ return;
+ }
+
+ // Select appropriate kernel
+ std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Define the select type (needed by the replicate-border path when the border size is greater than 1)
+ const DataType dt = tensor->info()->data_type();
+ std::string select_type = get_cl_type_from_data_type(dt);
+ if(is_data_type_float(dt))
+ {
+ select_type = (DataType::F32 == dt) ? "int" : "short";
+ }
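+    // SELECT_TYPE must be an integer type of the same width as the data type: the OpenCL select()
+    // builtin takes an integral predicate, so float and half map to int and short respectively.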
+
+ // Define build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(("-DSELECT_TYPE=" + select_type));
+ build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
+ build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
+ build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
+ build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _tensor = tensor;
+
+ // Create static kernel arguments
+ const unsigned int valid_width = tensor->info()->valid_region().shape[0];
+ const unsigned int valid_height = tensor->info()->valid_region().shape[1];
+ const cl_int2 valid_region_coords =
+ {
+ {
+ static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+ }
+ };
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+ ICLKernel::add_argument<cl_uint>(idx, valid_width);
+ ICLKernel::add_argument<cl_uint>(idx, valid_height);
+ ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
+ if(BorderMode::CONSTANT == border_mode)
+ {
+ switch(dt)
+ {
+ case DataType::U8:
+ set_constant_border<uint8_t>(idx, constant_border_value);
+ break;
+ case DataType::U16:
+ set_constant_border<uint16_t>(idx, constant_border_value);
+ break;
+ case DataType::S16:
+ set_constant_border<int16_t>(idx, constant_border_value);
+ break;
+ case DataType::U32:
+ set_constant_border<uint32_t>(idx, constant_border_value);
+ break;
+ case DataType::S32:
+ set_constant_border<int32_t>(idx, constant_border_value);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ set_constant_border<float>(idx, constant_border_value);
+ break;
+ case DataType::F16:
+ static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
+ set_constant_border<cl_half>(idx, constant_border_value);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ }
+
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(tensor->info(), Window::DimZ);
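+    // Note: the window launches a single 1D range covering both border regions,
+    // (left + valid width + right) work-items for the horizontal borders plus valid_height
+    // work-items for the vertical ones; splitting the work between the two regions is left
+    // to the fill_image_borders_* kernels themselves.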
+ ICLKernel::configure(win);
+}
+
+void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ // Border mode undefined or border width == 0
+ if(_kernel() == nullptr)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice, cl::NullRange);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..71d42c5606
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::string data_type_name;
+ data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
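+    // The output access window is scaled by 4 in X and 1/4 in Y: every 4x4 input block is written
+    // out as a single row of 16 interleaved values (e.g. a 16x8 input becomes a 64x2 output).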
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ /*
+ * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+ * |a30 a31 a32 a33|
+ *
+     * After this operation, the output matrix will have the following shape: [ width * 4, height / 4 ]
+ */
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ // Change x and y steps for the slide of output tensor
+ out_slice.scale(Window::DimX, 4.f);
+ out_slice.scale(Window::DimY, 0.25f);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..c6e05b92a2
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ // Create kernel and set static arguments
+ std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg<int32_t>(idx++, a_offset);
+ _kernel.setArg<int32_t>(idx++, b_offset);
+ _kernel.setArg<int32_t>(idx++, output_offset);
+ _kernel.setArg<int32_t>(idx++, output_mult_int);
+ _kernel.setArg<int32_t>(idx++, shift);
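+    // Quantisation parameters consumed by gemm_mm_u8: a_offset and b_offset are applied to the
+    // 8-bit A/B values, and the 32-bit accumulator is requantised back to 8 bits using
+    // output_offset, output_mult_int and a right shift by 'shift' (see gemm.cl).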
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ Window slice_matrix_b = slice;
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+ slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..289873c23f
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+ _biases = biases;
+ _accum = accum;
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+
+ Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+ AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, biases_access, accum_access);
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window accum_slice = window.first_slice_window_2D();
+
+ Window biases_slice(accum_slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
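+    // The bias tensor is 1D, so its slice is collapsed to a single row and re-used for every
+    // row slice of the accumulation tensor.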
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _accum, accum_slice);
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+
+ enqueue(queue, *this, accum_slice);
+ }
+ while(window.slide_window_slice_2D(accum_slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..343838f2f9
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+ std::ostringstream ma_arguments;
+ ma_arguments << "-DBETA=" << beta;
+ std::set<std::string> build_opts;
+ build_opts.emplace(ma_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..d7388e8579
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ if(output->info()->dimension(1) == 1)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ }
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
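+    // Local work-group size hint chosen empirically: a narrow 1x7 shape when the output height
+    // is 196 (likely a convolution-layer case), 8x8 otherwise.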
+
+ std::ostringstream mm_arguments;
+ mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
+ mm_arguments << "-DALPHA=" << alpha << " ";
+ std::set<std::string> build_opts;
+
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if(output->info()->dimension(1) == 1)
+ {
+ mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
+
+        // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+
+ if(data_type_name == "f32")
+ {
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ }
+
+        // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+ AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+}
+
+void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ Window slice_matrix_b = slice;
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+ slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ecee1abd72
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+ /*
+     * The following is an example of how the transpose1xW operation works when the input data type is F32
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ */
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ float scale_x = 1.f;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ scale_x = 16.f;
+ break;
+ case DataType::F16:
+ scale_x = 8.f;
+ break;
+ case DataType::F32:
+ scale_x = 4.f;
+ break;
+ default:
+ // Do nothing
+ break;
+ }
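+    // scale_x is 16 / element_size, i.e. the number of elements packed into each 16-byte
+    // transposed block, matching the transpose_w factor used above to compute the output shape.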
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Output is transposed
+ Window out_window(window);
+ out_window.set(Window::DimX, window.y());
+ out_window.set(Window::DimY, window.x());
+
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..e5bc3f9656
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLGaussian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Set build options
+ std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
+ "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
+ "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
+ "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
+ };
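+
+ // These defines encode the 3x3 Gaussian matrix { 1, 2, 1; 2, 4, 2; 1, 2, 1 } with a
+ // normalization factor of 16 (the sum of the coefficients), so the generic
+ // convolution3x3_static kernel can be reused unchanged.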
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..bd523c883d
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+
+ // Set arguments
+ CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined);
+}
+
+void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ const uint32_t scale = 256;
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 };
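+
+ // Illustrative check: the full 5x5 Gaussian is the outer product of { 1, 4, 6, 4, 1 }
+ // with itself, and its coefficients sum to 16 * 16 = 256, hence the scale used here.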
+
+ // Set arguments
+ CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined);
+}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..34a228c717
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
+ : _border_size(0), _l2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidHorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 20;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr float scale_x = 0.5f;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+ // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
+ // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+ // a pixel is even or odd is determined based on the tensor shape not the
+ // valid region!)
+ // Thus the offset from which the first pixel (L2) for the convolution is
+ // loaded depends on the anchor and shape of the valid region.
+ // In the case of an even shape (= even image width) we need to load L2
+ // from -2 if the anchor is odd and from -1 if the anchor is even. That
+ // makes sure that L2 is always loaded from an odd pixel.
+ // On the other hand, for an odd shape (= odd image width) we need to load
+ // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+ // achieve the opposite effect.
+ // The condition can therefore be simplified to checking whether anchor + shape
+ // is odd (load from -2) or even (load from -1): the sum is odd only when
+ // exactly one of the two terms is odd.
+ _l2_load_offset = -border_size().left;
+
+ if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+ {
+ _l2_load_offset += 1;
+ }
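+
+ // Illustrative values: with anchor 0 and shape 640 (even sum), L2 is loaded from
+ // offset -1 so it falls on an odd pixel; with anchor 0 and shape 639 (odd sum),
+ // it is loaded from offset -2.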
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+ valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
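+ // Illustrative values: for an input valid region with anchor 0 and shape 640 and
+ // border_undefined = true (left/right border of 2), the output valid region becomes
+ // anchor ceil(2 / 2) = 1 and shape (640 - 2) / 2 - 1 = 318 of the 320 output columns.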
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, _l2_load_offset);
+
+ //The output is half the width of the input:
+ Window win_out(window);
+ win_out.scale(Window::DimX, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
+
+CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
+ : _t2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidVertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_rows_processed_per_iteration = 2;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_per_iteration = 5;
+ constexpr float scale_y = 0.5f;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
+ border_undefined, border_size());
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
+
+ // Determine whether we need to load even or odd rows. See above for a
+ // detailed explanation.
+ _t2_load_offset = -border_size().top;
+
+ if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+ {
+ _t2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+ valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() != 8);
+ ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
+
+ Window win_in(window);
+ win_in.shift(Window::DimY, _t2_load_offset);
+
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..87659c4ba9
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+ : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+ _input_magnitude = input_magnitude;
+ _input_phase = input_phase;
+ _output = output;
+ _cell_size = hog_info->cell_size();
+
+ float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
+ phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
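+ // For example, with SIGNED phase and 9 bins: phase_scale = (9 / 360) * (360 / 255) = 9 / 255,
+ // i.e. the U8 phase value (0..255, spanning 0..360 degrees) is mapped onto the 9-bin range.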
+
+ std::stringstream args_str;
+ args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+ args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+ args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+ args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ // Compute slice for the magnitude and phase tensors
+ Window slice_mag_phase = window.first_slice_window_2D();
+ slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+ slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+ add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+ : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+ // Number of cells per block
+ const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+ hog_info->block_size().height / hog_info->cell_size().height);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+ // Number of cells per block stride
+ const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+ hog_info->block_stride().height / hog_info->cell_size().height);
+
+ _input = input;
+ _output = output;
+ _num_cells_per_block_stride = num_cells_per_block_stride;
+
+ std::stringstream args_str;
+ args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+ args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+ args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
+ args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+ args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+ args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+ args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+ args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_cells_per_block.height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+ const unsigned int num_rows_written_per_iteration = num_cells_per_block.height;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ // Compute slice for the input tensor
+ Window slice_in = window.first_slice_window_2D();
+ slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+ slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..0f9a98950d
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
+{
+}
+
+void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+ float threshold, uint16_t idx_class)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _num_detection_windows = num_detection_windows;
+
+ const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+ ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
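+
+ // Illustrative values: for a 64x128 detection window, 16x16 blocks with an 8x8 block
+ // stride and 36 bins per block (9 bins x 2x2 cells), num_bins_per_descriptor_x =
+ // ((64 - 16) / 8 + 1) * 36 = 252 and num_blocks_per_descriptor_y = (128 - 16) / 8 + 1 = 15,
+ // giving a descriptor size of 252 * 15 + 1 = 3781 (the + 1 typically being the SVM bias term).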
+
+ std::stringstream args_str;
+ args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+ args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+ args_str << "-DTHRESHOLD=" << threshold << " ";
+ args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+ args_str << "-DIDX_CLASS=" << idx_class << " ";
+ args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+ args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+ args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+ args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, hog->cl_buffer());
+ _kernel.setArg(idx++, detection_windows->cl_buffer());
+ _kernel.setArg(idx++, *_num_detection_windows);
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
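+
+ // Illustrative values: with 80x60 valid blocks, an 8x16-block detection window and a
+ // window step of one block in each direction, the window ends are
+ // floor_to_multiple(80 - 8, 1) = 72 in x and floor_to_multiple(60 - 16, 1) = 44 in y.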
+
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..9fc34a7760
--- /dev/null
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHarrisScoreKernel::CLHarrisScoreKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
+{
+}
+
+BorderSize CLHarrisScoreKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+ int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
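+ // For example, a block size of 5 yields a 2-pixel border on each side, and a block size of 7 yields 3.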
+
+ // Select kernel
+ std::stringstream harris_score_kernel_name;
+ harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
+
+ // Create build options
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, sensitivity);
+ _kernel.setArg(idx++, strength_thresh);
+ _kernel.setArg(idx++, norm_factor);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ constexpr unsigned int num_elems_written_per_iteration = 4;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
new file mode 100644
index 0000000000..87ee5fb74e
--- /dev/null
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+#include <string>
+
+using namespace arm_compute;
+
+// Each thread handles 16 pixels
+constexpr signed int pixels_per_item = 16;
+
+// Local work group size in the X dimension
+constexpr unsigned int local_x_size = 16;
+
+CLHistogramKernel::CLHistogramKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Check input size
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is outside the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range is larger than the image value range.");
+
+ _input = input;
+ _output = output;
+
+ if(_input->info()->dimension(0) < pixels_per_item)
+ {
+ return;
+ }
+
+ unsigned int num_bins = _output->num_bins();
+ unsigned int window_size = _output->window();
+ unsigned int offset = _output->offset();
+ unsigned int range = _output->range();
+ unsigned int offrange = offset + range;
+ unsigned int bin_size = _output->size();
+ unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
+
+ // Create kernel
+ bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+ std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
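+ // For example, a full-range U8 histogram (256 bins, window 1, offset 0, range 256) selects
+ // hist_local_kernel_fixed; any other configuration falls back to hist_local_kernel with the
+ // bin parameters passed as extra kernel arguments below.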
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters; the distribution is passed as a plain buffer below
+ _kernel.setArg(idx++, buffer_size, nullptr);
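+ // Passing a size with a null pointer reserves buffer_size bytes of __local memory,
+ // presumably used as the per-work-group histogram scratch buffer.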
+ _kernel.setArg(idx++, _output->cl_buffer());
+ if(!is_fixed_size)
+ {
+ _kernel.setArg<cl_uint>(idx++, num_bins);
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg<cl_uint>(idx++, range);
+ _kernel.setArg<cl_uint>(idx++, offrange);
+ }
+
+ // The histogram is only run on images, therefore only 2 dimensions are used here
+ unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, end_position, pixels_per_item));
+ win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
+
+ ICLKernel::configure(win);
+}
+
+void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ if(_input->info()->dimension(0) < pixels_per_item)
+ {
+ return;
+ }
+
+ _output->map(queue, true);
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ memset(_output->buffer(), 0, _output->size());
+ _output->unmap(queue);
+
+ Window slice = window.first_slice_window_2D();
+ cl::NDRange lws = cl::NDRange(local_x_size, 1);
+
+ do
+ {
+ /* Run the core part, whose width is divisible by 16 */
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice, lws);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLHistogramBorderKernel::CLHistogramBorderKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Check input size
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is outside the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range is larger than the image value range.");
+
+ // The histogram is only run on images, therefore only 2 dimensions are used here
+ unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+ if(start_position >= input->info()->dimension(0))
+ {
+ return; // no need to run histogram border kernel
+ }
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_bins = _output->num_bins();
+ unsigned int window_size = _output->window();
+ unsigned int offset = _output->offset();
+ unsigned int range = _output->range();
+ unsigned int offrange = offset + range;
+
+ // Create kernel
+ bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+ std::string kernel_name = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters; the distribution is passed as a plain buffer below
+ _kernel.setArg(idx++, _output->cl_buffer());
+ if(!is_fixed_size)
+ {
+ _kernel.setArg<cl_uint>(idx++, num_bins);
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg<cl_uint>(idx++, range);
+ _kernel.setArg<cl_uint>(idx++, offrange);
+ }
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
+ win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
+ ICLKernel::configure(win);
+}
+
+void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ if(window.x().start() >= window.x().end())
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ cl::NDRange lws = cl::NDRange(1, 1);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ /* Run the border part, whose width is not divisible by 16 */
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice, lws);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
new file mode 100644
index 0000000000..8c0fe26666
--- /dev/null
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLIm2ColKernel::CLIm2ColKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+{
+}
+
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace((has_bias ? "-DHAS_BIAS" : ""));
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = conv_info.pad();
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+ && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1))
+ && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
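+
+ // Illustrative shapes: a fully-connected layer fed with a 7x7x512 input and no
+ // padding/stride produces an output row of 7 * 7 * 512 = 25088 elements, so the reduced
+ // path simply flattens the input; a strided or padded convolution takes the generic path.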
+
+ if(!run_img2col_reduced)
+ {
+ _convolved_dims = convolved_dims;
+ _conv_info = conv_info;
+ _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _num_elems_processed_per_iteration = output->info()->dimension(0);
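+
+ // Illustrative values: for a 3x3 convolution over 64 input channels with a bias,
+ // output dimension 0 is 3 * 3 * 64 + 1 = 577, so _kernel_size = sqrt((577 - 1) / 64) = 3.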
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+ // Create static kernel arguments
+ const cl_int2 input_dims =
+ {
+ {
+ static_cast<cl_int>(input->info()->dimension(0)),
+ static_cast<cl_int>(input->info()->dimension(1)),
+ }
+ };
+ const cl_int2 strides =
+ {
+ {
+ stride_x,
+ stride_y,
+ }
+ };
+ const cl_int2 paddings =
+ {
+ {
+ pad_x,
+ pad_y,
+ }
+ };
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_int>(idx++, _kernel_size);
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
+ _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
+ _kernel.setArg<cl_int2>(idx++, input_dims);
+ _kernel.setArg<cl_int2>(idx++, strides);
+ _kernel.setArg<cl_int2>(idx++, paddings);
+
+ _run_func = &CLIm2ColKernel::run_generic;
+ }
+ else
+ {
+ _num_elems_processed_per_iteration = 1;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+ _run_func = &CLIm2ColKernel::run_reduced;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
+ (this->*_run_func)(window, queue);
+}
+
+void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = _conv_info.pad();
+ std::tie(stride_x, stride_y) = _conv_info.stride();
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+ slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup input slice
+ // The first three dimensions of the input are increased by the inner loops
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup output slice
+ slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+ slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+}
+
+void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window out_slice = out_window.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_3D();
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
new file mode 100644
index 0000000000..69ede457df
--- /dev/null
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+ const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+CLIntegralImageVertKernel::CLIntegralImageVertKernel()
+ : _in_out(nullptr)
+{
+}
+
+void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
+
+ _in_out = in_out;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8;
+ const unsigned int num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
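+
+ // With the Y step equal to the full image height, the window collapses to a single row of
+ // work items, each of which covers 8 complete columns per iteration.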
+
+ Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, in_out_access);
+
+ in_out_access.set_valid_region(win, in_out->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const size_t height = _in_out->info()->dimension(1);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _in_out, slice);
+ _kernel.setArg<cl_uint>(idx++, height);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
new file mode 100644
index 0000000000..12cdd0ec93
--- /dev/null
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
+
+{
+ ARM_COMPUTE_ERROR_ON(old_points == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+
+ const float scale = std::pow(pyramid_scale, level);
+
+ // Create kernel
+ std::string kernel_name = "init_level";
+ if(level == (num_levels - 1))
+ {
+ kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = 0;
+ if(level == (num_levels - 1))
+ {
+ _kernel.setArg(idx++, old_points->cl_buffer());
+ if(use_initial_estimate)
+ {
+ _kernel.setArg(idx++, new_points_estimates->cl_buffer());
+ }
+ }
+ _kernel.setArg(idx++, old_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg<cl_float>(idx++, scale);
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(window);
+}
+
+void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
+{
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points == nullptr);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize"));
+
+ // Set static kernel arguments
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points->cl_buffer());
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(window);
+}
+
+void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
+ : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
+{
+}
+
+void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ size_t window_dimension, size_t level)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _old_input = old_input;
+ _old_scharr_gx = old_scharr_gx;
+ _old_scharr_gy = old_scharr_gy;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const ValidRegion valid_region = intersect_valid_regions(
+ old_input->info()->valid_region(),
+ old_scharr_gx->info()->valid_region(),
+ old_scharr_gy->info()->valid_region());
+
+ update_window_and_padding(window,
+ AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
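+    // Note: eig_const presumably folds the 1/2 from the 2x2 eigenvalue formula and the
+    // 1/(window area) normalization into a single factor, e.g. a 7x7 window gives 1 / (2 * 49)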
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0"));
+
+ // Set arguments
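+    // The first 3 * num_arguments_per_2D_tensor() slots belong to the three 2D tensors (old input and
+    // the two Scharr gradients); they are bound in run() as their CL buffers might not be allocated yet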
+ unsigned int idx = 3 * num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, old_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+}
+
+void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set the tensor arguments here (rather than in configure()) as the tensor allocation might be deferred until run time.
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _old_input, window);
+ add_2D_tensor_argument(idx, _old_scharr_gx, window);
+ add_2D_tensor_argument(idx, _old_scharr_gy, window);
+
+ enqueue(queue, *this, window);
+}
+
+CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
+ : _new_input(nullptr)
+{
+}
+
+void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _new_input = new_input;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const ValidRegion &valid_region = new_input->info()->valid_region();
+
+ update_window_and_padding(window,
+ AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+ const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+ const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_int>(idx++, num_iterations);
+ _kernel.setArg<cl_float>(idx++, epsilon);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+ _kernel.setArg<cl_int>(idx++, term_iteration);
+ _kernel.setArg<cl_int>(idx++, term_epsilon);
+}
+
+void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set the tensor arguments here (rather than in configure()) as the tensor allocation might be deferred until run time.
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _new_input, window);
+
+ enqueue(queue, *this, window);
+}
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..794a1bc56e
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
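+    // Local work-group size hint. The 196-row case is special-cased (assumption: this corresponds to a
+    // 14x14 output map that benefits from a 1x7 work-group); all other shapes use an 8x8 work-group.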
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DWIDTH_VECTOR_A=" + val_to_string(input0->info()->dimension(0)));
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
+
+    // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ Window matrix_b_window;
+ matrix_b_window.use_tensor_dimensions(_input1->info());
+ Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_3D_tensor_argument(idx, _input1, slice_matrix_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..c504189169
--- /dev/null
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
+ : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
+{
+}
+
+void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+ MagnitudeType mag_type, PhaseType phase_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+
+ _run_mag = (magnitude != nullptr);
+ _run_phase = (phase != nullptr);
+ if(_run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
+ }
+ if(_run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ }
+
+ if(!_run_mag && !_run_phase)
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ // Construct kernel name
+ std::set<std::string> build_opts = {};
+
+ // Add magnitude type
+ if(_run_mag)
+ {
+ switch(mag_type)
+ {
+ case MagnitudeType::L1NORM:
+ build_opts.insert("-DMAGNITUDE=1");
+ break;
+ case MagnitudeType::L2NORM:
+ build_opts.insert("-DMAGNITUDE=2");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
+ build_opts.insert("-DMAGNITUDE=0");
+ break;
+ }
+ }
+
+ // Add phase type
+ if(_run_phase)
+ {
+ switch(phase_type)
+ {
+ case PhaseType::UNSIGNED:
+ build_opts.insert("-DPHASE=1");
+ break;
+ case PhaseType::SIGNED:
+ build_opts.insert("-DPHASE=2");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
+ build_opts.insert("-DPHASE=0");
+ break;
+ }
+ }
+
+ // Add data_type
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ gx_access, gy_access,
+ output_magnitude_access, output_phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+ output_magnitude_access.set_valid_region(win, valid_region);
+ output_phase_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _gx, slice);
+ add_2D_tensor_argument(idx, _gy, slice);
+
+ if(_run_mag)
+ {
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ }
+
+ if(_run_phase)
+ {
+ add_2D_tensor_argument(idx, _phase, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..b0b748f466
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMeanStdDevKernel::CLMeanStdDevKernel()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr)
+{
+}
+
+void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == mean);
+ ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+ ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+
+ _input = input;
+ _mean = mean;
+ _stddev = stddev;
+ _global_sum = global_sum;
+ _global_sum_squared = global_sum_squared;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ if(_stddev != nullptr)
+ {
+ build_opts.insert("-DSTDDEV");
+ }
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
+
+ _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1)));
+ _kernel.setArg(idx++, *_global_sum);
+
+ if(_stddev != nullptr)
+ {
+ _kernel.setArg(idx++, *_global_sum_squared);
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8;
+ const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ update_window_and_padding(win, input_access);
+
+ ICLKernel::configure(win);
+}
+
+void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Clear sums
+ static const cl_ulong zero = 0;
+ queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+
+ if(_stddev != nullptr)
+ {
+ queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+ }
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ // Set slice step equal to height to force gws[1] to 1,
+        // as each work-item accumulates the sum over all rows for its block of columns
+ slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+
+ // Calculate mean and stddev
+ cl_ulong global_sum = 0;
+ cl_ulong global_sum_squared = 0;
+ const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
+
+ queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
+ const float mean = global_sum / num_pixels;
+ *_mean = mean;
+
+ if(_stddev != nullptr)
+ {
+ queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
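+        // stddev = sqrt(E[x^2] - mean^2), using the accumulated sum of squared pixel values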
+ *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean));
+ }
+}
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..95334c7b5f
--- /dev/null
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLMedian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" }));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..939a53b03a
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+CLMinMaxKernel::CLMinMaxKernel()
+ : _input(nullptr), _min_max(), _data_type_max_min()
+{
+}
+
+void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+
+ _input = input;
+ _min_max = min_max;
+ const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ _data_type_max_min[0] = UCHAR_MAX;
+ _data_type_max_min[1] = 0;
+ break;
+ case DataType::S16:
+ _data_type_max_min[0] = SHRT_MAX;
+ _data_type_max_min[1] = SHRT_MIN;
+ break;
+ default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
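+    // Flag widths that are not a multiple of the CL vector width (16) so the kernel can handle the
+    // leftover columns (assumption based on the define name)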
+ build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, *_min_max);
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Reset minimum and maximum values
+ queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data());
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLMinMaxLocationKernel::CLMinMaxLocationKernel()
+ : _input(nullptr), _min_max_count(nullptr)
+{
+}
+
+void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+ ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
+
+ _input = input;
+ _min_max_count = min_max_count;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
+ build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
+ build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, *min_max);
+ _kernel.setArg(idx++, *min_max_count);
+ if(min_loc != nullptr)
+ {
+ _kernel.setArg(idx++, min_loc->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
+ }
+ if(max_loc != nullptr)
+ {
+ _kernel.setArg(idx++, max_loc->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ static const unsigned int zero_count = 0;
+ queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count);
+ queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..6afa5822ba
--- /dev/null
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLNonLinearFilterKernel::CLNonLinearFilterKernel()
+ : _border_size(0)
+{
+}
+
+BorderSize CLNonLinearFilterKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+ unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
+ ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
+ ARM_COMPUTE_UNUSED(mask);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(mask_size / 2);
+
+ // Define build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
+
+ // Define kernel
+ std::string pattern_name = string_from_matrix_pattern(pattern);
+ std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower);
+ std::stringstream ss;
+ ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(ss.str(), build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ const unsigned int num_rows_read_per_iteration = mask_size;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..6a96b0effd
--- /dev/null
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..106a5113db
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerKernel::CLNormalizationLayerKernel()
+ : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize CLNormalizationLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ _input = input;
+ _squared_input = squared_input;
+ _output = output;
+
+ const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D);
+ const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
+
+ // Create kernel
+ std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set kernel static arguments
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff());
+ _kernel.setArg<cl_float>(idx++, norm_info.beta());
+ _kernel.setArg<cl_float>(idx++, norm_info.kappa());
+ _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
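+    // Each work-item reads norm_size/2 extra elements on either side of the block it normalizes,
+    // hence the wider input accesses below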
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, squared_input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _squared_input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..84eb434bc9
--- /dev/null
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+    // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1
+ // Moreover, it will be negative as we deal with 1/2^n
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_int = std::abs(exponent - 1);
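+        // Illustrative example: scale = 1/8 gives frexp() mantissa 0.5 and exponent -2,
+        // so scale_int = abs(-2 - 1) = 3, i.e. the factor 1/2^3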
+ }
+
+ std::string data_type;
+ std::string compute_type;
+ // Check if it has float inputs and output
+ if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
+ {
+ scale_int = -1;
+ compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half";
+ data_type = "DATA_TYPE_FLOAT";
+ }
+ else
+ {
+ compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort";
+ data_type = "DATA_TYPE_INT";
+ }
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_mul";
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set scale argument
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters
+
+ if(scale_int >= 0)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..dc5ae4ec7a
--- /dev/null
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLPoolingLayerKernel::CLPoolingLayerKernel()
+ : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+{
+}
+
+BorderSize CLPoolingLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ const PoolingType pool_type = pool_info.pool_type();
+ const int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ DimensionRoundingType pool_round = pad_stride_info.round();
+ std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+
+ // Check output dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+ input->info()->dimension(1),
+ pool_size,
+ pool_stride_x, pool_stride_y,
+ pool_pad_x, pool_pad_y,
+ pool_round);
+ ARM_COMPUTE_UNUSED(pooled_w);
+ ARM_COMPUTE_UNUSED(pooled_h);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
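+    // The right/bottom border must cover how far the last pooling window can overrun the input edge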
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+
+ // Create kernel
+ std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Additional static arguments are only required for average pooling
+ if(pool_type == PoolingType::AVG)
+ {
+ // Create static kernel arguments
+ const cl_int2 max_dims =
+ {
+ {
+ static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
+ static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
+ }
+ };
+ const cl_int2 strides =
+ {
+ {
+ pool_stride_x,
+ pool_stride_y,
+ }
+ };
+ const cl_int2 paddings =
+ {
+ {
+ pool_pad_x,
+ pool_pad_y,
+ }
+ };
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_int2>(idx++, max_dims);
+ _kernel.setArg<cl_int2>(idx++, strides);
+ _kernel.setArg<cl_int2>(idx++, paddings);
+ }
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    unsigned int pool_pad_x = 0, pool_pad_y = 0, pool_stride_x = 0, pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+        // Map the output window onto the input: shift by the padding and scale by the pooling stride
+ Window in_slice(slice);
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+ in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
new file mode 100644
index 0000000000..e63a5ef7c6
--- /dev/null
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLRemapKernel::CLRemapKernel()
+ : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+BorderSize CLRemapKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
+
+ _input = input;
+ _output = output;
+ _map_x = map_x;
+ _map_y = map_y;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "remap_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
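+    // With an undefined border the input access window does not need to reach outside the image, so no extra offset is required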
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowStatic     input_access(input->info(), -border_offset, -border_offset,
+                                        input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set static arguments
+    unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the four tensor parameters (input, output, map_x, map_y)
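+    // The input extents are passed as floats; the remap kernel is assumed to use them to clamp the coordinates read from the maps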
+ _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
+}
+
+void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _map_x, slice);
+ add_2D_tensor_argument(idx, _map_y, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
new file mode 100644
index 0000000000..d74e837ace
--- /dev/null
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLScaleKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ /* Compute the ratio between source width/height and destination width/height */
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+ }
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "scale_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
+ input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
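+    // Input and output extents are passed so the scale kernel can presumably derive the width/height sampling ratios itself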
+ _kernel.setArg<float>(idx++, input->info()->dimension(0));
+ _kernel.setArg<float>(idx++, input->info()->dimension(1));
+ _kernel.setArg<float>(idx++, output->info()->dimension(0));
+ _kernel.setArg<float>(idx++, output->info()->dimension(1));
+}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..913ef592d4
--- /dev/null
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLScharr3x3Kernel::CLScharr3x3Kernel()
+ : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize CLScharr3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_scharr_x = output_x != nullptr;
+ _run_scharr_y = output_y != nullptr;
+
+ if(_run_scharr_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_scharr_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_scharr_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_scharr_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
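+    // Reading 16 pixels per row, starting one pixel to the left, covers the 3x3 neighbourhood of all 8 output pixels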
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_scharr_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_scharr_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
new file mode 100644
index 0000000000..436aaa498a
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel3x3Kernel::CLSobel3x3Kernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
new file mode 100644
index 0000000000..4c0316f19e
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel5x5HorKernel::CLSobel5x5HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize CLSobel5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
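+    // The horizontal 1x5 pass always needs 2 pixels of border on the left/right; top/bottom is only kept when the border must be filled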
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLSobel5x5VertKernel::CLSobel5x5VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts));
+
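+    // Both intermediate tensors come from the horizontal pass and share a shape, so either one can drive the window configuration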
+ const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _input_x, slice);
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _input_y, slice);
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ _kernel.setArg(idx++, 0 /*dummy*/);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
new file mode 100644
index 0000000000..a477953cfb
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel7x7HorKernel::CLSobel7x7HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize CLSobel7x7HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 3, 3);
+
+ // Construct kernel name
+ std::string kernel_name = "sobel_separable1x7";
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLSobel7x7VertKernel::CLSobel7x7VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel7x7VertKernel::border_size() const
+{
+ return BorderSize(3, 0);
+}
+
+void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts));
+
+ const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _input_x, slice);
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _input_y, slice);
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ _kernel.setArg(idx++, 0 /*dummy*/);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..0470d5243e
--- /dev/null
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ // The kernel loops over all elements in steps of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
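+    // Using the padded row width as the window step gives one work-item per row, since the max is a full-row reduction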
+
+ // Set build options
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
+ : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
+
+ // The kernel loops over all elements in steps of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
+
+ // Set build options
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+ // Set fixed arguments
+    unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the four tensor parameters (input, max, output, sum)
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal max_access(max->info(), 0, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ // Set inputs
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _max, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _sum, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLLogits1DNormKernel::CLLogits1DNormKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
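+        // The sum tensor holds one value per row, so collapse the X dimension of its slice to a single step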
+ Window sum_slice = slice;
+ sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ unsigned int idx = 0;
+ // Set inputs
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
new file mode 100644
index 0000000000..bbdaa37410
--- /dev/null
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+#include <string>
+
+using namespace arm_compute;
+
+void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+ ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ // Create kernel
+ std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set lut argument
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, lut->cl_buffer());
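+    // A signed 16-bit LUT additionally needs its index offset and element count so signed inputs can be mapped into the table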
+ if(DataType::S16 == lut->type())
+ {
+ _kernel.setArg(idx++, lut->index_offset());
+ _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
+ }
+
+ // Configure kernel
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
new file mode 100644
index 0000000000..6e07cefc77
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+ uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Construct kernel name
+ std::string kernel_name = "threshold";
+
+ switch(type)
+ {
+ case ThresholdType::BINARY:
+ kernel_name += "_binary";
+ break;
+ case ThresholdType::RANGE:
+ kernel_name += "_range";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Thresholding type not recognized");
+ break;
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, false_value);
+ _kernel.setArg(idx++, true_value);
+ _kernel.setArg(idx++, threshold);
+
+ if(ThresholdType::RANGE == type)
+ {
+ _kernel.setArg(idx++, upper);
+ }
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
new file mode 100644
index 0000000000..2ee6fcb9dc
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ _lws_hint = cl::NDRange(2, 8);
+
+ std::set<std::string> build_opts;
+ std::ostringstream data_type_in_bytes;
+ data_type_in_bytes << input->info()->element_size();
+ build_opts.emplace("-DDATA_TYPE_IN_BYTES=" + data_type_in_bytes.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts));
+
+ // Configure kernel window
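+    // The window steps by the same amount in X and Y, so each iteration covers a square block one OpenCL vector wide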
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->info()->element_size();
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
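A minimal usage sketch for the new kernel, assuming the usual CLTensor/CLScheduler runtime classes (illustrative only, not part of this patch):

#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void transpose_example()
{
    CLScheduler::get().default_init(); // Set up an OpenCL context and queue

    CLTensor src;
    CLTensor dst;
    src.allocator()->init(TensorInfo(TensorShape(40U, 100U), 1, DataType::F32));

    CLTransposeKernel transpose;
    transpose.configure(&src, &dst); // dst is auto-initialized to the transposed shape 100x40

    src.allocator()->allocate();
    dst.allocator()->allocate();

    CLScheduler::get().enqueue(transpose);
}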
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
new file mode 100644
index 0000000000..e549dbc258
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+ for(size_t i = 0; i < size; ++i)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+ options.insert(mat_str.str());
+ }
+}
+} // namespace
+
+BorderSize CLWarpAffineKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> options;
+ options_add_matrix(options, matrix, 6);
+ options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "warp_affine_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
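For reference, options_add_matrix() above simply turns each affine coefficient into a -DMATi build option; a sketch of the option set a U8 warp with the identity matrix would produce (illustrative values only, not part of this patch):

#include <set>
#include <string>

// Illustrative only: options generated for the identity 2x3 matrix {1, 0, 0, 1, 0, 0}
// on a U8 input (get_cl_type_from_data_type(DataType::U8) maps to "uchar").
const std::set<std::string> example_warp_affine_options =
{
    "-DMAT0=1 ", "-DMAT1=0 ", "-DMAT2=0 ",
    "-DMAT3=1 ", "-DMAT4=0 ", "-DMAT5=0 ",
    "-DDATA_TYPE=uchar"
};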
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
new file mode 100644
index 0000000000..fddb580750
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+ for(size_t i = 0; i < size; ++i)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+ options.insert(mat_str.str());
+ }
+}
+} // namespace
+
+BorderSize CLWarpPerspectiveKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> options;
+ options_add_matrix(options, matrix, 9);
+ options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "warp_perspective_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..018f272921
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+ : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ if(_is_shared)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+ }
+
+ // Check biases
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ }
+
+ _biases = biases;
+ _output = output;
+ _input = input;
+
+ // Create build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(false)
+{
+}
+
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ Window biases_slice;
+ biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+}
+
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(true)
+{
+}
+
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ Window biases_window;
+ Window biases_slice;
+
+ if(_biases != nullptr)
+ {
+ biases_window.use_tensor_dimensions(_biases->info());
+ biases_slice = biases_window.first_slice_window_1D();
+ }
+
+ do
+ {
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ biases_window.slide_window_slice_1D(biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
new file mode 100644
index 0000000000..9d18a9c165
--- /dev/null
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+
+using namespace arm_compute;
+
+ICPPSimpleKernel::ICPPSimpleKernel()
+ : _input{ nullptr }, _output{ nullptr }
+{
+}
+
+void ICPPSimpleKernel::configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+ ICPPKernel::configure(win);
+}
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
new file mode 100644
index 0000000000..884da2861b
--- /dev/null
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+namespace
+{
+inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+{
+ if(strength != 0.0f)
+ {
+ /* Set index and update num_corner_candidates */
+ std::unique_lock<std::mutex> lock(*corner_candidates_mutex);
+
+ const int32_t idx = *num_corner_candidates;
+
+ *num_corner_candidates += 1;
+
+ lock.unlock();
+
+ /* Add keypoint */
+ output[idx] = std::make_tuple(x, y, strength);
+ }
+}
+
+inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+{
+ check_corner(x + 0, y, *(input + 0), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 1, y, *(input + 1), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 2, y, *(input + 2), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 3, y, *(input + 3), output, num_corner_candidates, corner_candidates_mutex);
+}
+} // namespace
+
+bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
+{
+ return std::get<2>(lhs) > std::get<2>(rhs);
+}
+
+CPPCornerCandidatesKernel::CPPCornerCandidatesKernel()
+ : _num_corner_candidates(nullptr), _corner_candidates_mutex(), _input(nullptr), _output(nullptr)
+{
+}
+
+void CPPCornerCandidatesKernel::configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
+ ARM_COMPUTE_ERROR_ON(*num_corner_candidates != 0);
+
+ _input = input;
+ _output = output;
+ _num_corner_candidates = num_corner_candidates;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void CPPCornerCandidatesKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ corner_candidates(reinterpret_cast<float *>(input.ptr()), &_output[0], id.x(), id.y(), _num_corner_candidates, &_corner_candidates_mutex);
+ },
+ input);
+}
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
new file mode 100644
index 0000000000..62bfdd60ba
--- /dev/null
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+bool compare_detection_window(const DetectionWindow &lhs, const DetectionWindow &rhs)
+{
+ return lhs.score > rhs.score;
+}
+} // namespace
+
+CPPDetectionWindowNonMaximaSuppressionKernel::CPPDetectionWindowNonMaximaSuppressionKernel()
+ : _input_output(nullptr), _min_distance(0.0f)
+{
+}
+
+bool CPPDetectionWindowNonMaximaSuppressionKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPDetectionWindowNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input_output);
+
+ _input_output = input_output;
+ _min_distance = min_distance;
+
+ IKernel::configure(Window()); // Default 1 iteration window
+}
+
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
+
+ const size_t num_candidates = _input_output->num_values();
+ size_t num_detections = 0;
+
+ // Sort list of candidates
+ std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
+
+ const float min_distance_pow2 = _min_distance * _min_distance;
+
+ // Euclidean distance
+ for(size_t i = 0; i < num_candidates; ++i)
+ {
+ if(0.0f != _input_output->at(i).score)
+ {
+ DetectionWindow cur;
+ cur.x = _input_output->at(i).x;
+ cur.y = _input_output->at(i).y;
+ cur.width = _input_output->at(i).width;
+ cur.height = _input_output->at(i).height;
+ cur.idx_class = _input_output->at(i).idx_class;
+ cur.score = _input_output->at(i).score;
+
+ // Store window
+ _input_output->at(num_detections) = cur;
+
+ ++num_detections;
+
+ const float xc = cur.x + cur.width * 0.5f;
+ const float yc = cur.y + cur.height * 0.5f;
+
+ for(size_t k = i + 1; k < num_candidates; ++k)
+ {
+ const float xn = _input_output->at(k).x + _input_output->at(k).width * 0.5f;
+ const float yn = _input_output->at(k).y + _input_output->at(k).height * 0.5f;
+
+ const float dx = std::fabs(xn - xc);
+ const float dy = std::fabs(yn - yc);
+
+ if(dx < _min_distance && dy < _min_distance)
+ {
+ const float d = dx * dx + dy * dy;
+
+ if(d < min_distance_pow2)
+ {
+ // Invalidate keypoint
+ _input_output->at(k).score = 0.0f;
+ }
+ }
+ }
+ }
+ }
+
+ _input_output->resize(num_detections);
+}
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
new file mode 100644
index 0000000000..09d3ccffa4
--- /dev/null
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
+{
+ return std::get<2>(lhs) > std::get<2>(rhs);
+}
+} // namespace
+
+CPPSortEuclideanDistanceKernel::CPPSortEuclideanDistanceKernel()
+ : _num_corner_candidates(), _min_distance(0.0f), _in_out(nullptr), _output(nullptr)
+{
+}
+
+void CPPSortEuclideanDistanceKernel::configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == in_out);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
+ ARM_COMPUTE_ERROR_ON(!((min_distance > 0) && (min_distance <= 30)));
+
+ _in_out = in_out;
+ _output = output;
+ _min_distance = min_distance * min_distance; // We compare squares of distances
+ _num_corner_candidates = num_corner_candidates;
+ ICPPKernel::configure(Window()); // Default 1 iteration window
+}
+
+bool CPPSortEuclideanDistanceKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPSortEuclideanDistanceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
+
+ const int32_t num_corner_candidates = *_num_corner_candidates;
+
+ /* Sort list of corner candidates */
+ std::sort(_in_out, _in_out + num_corner_candidates, keypoint_compare);
+
+ /* Euclidean distance */
+ for(int32_t i = 0; i < num_corner_candidates; ++i)
+ {
+ if(std::get<2>(_in_out[i]) != 0.0f)
+ {
+ KeyPoint keypt;
+ const auto xc = std::get<0>(_in_out[i]);
+ const auto yc = std::get<1>(_in_out[i]);
+
+ keypt.x = xc;
+ keypt.y = yc;
+ keypt.strength = std::get<2>(_in_out[i]);
+ keypt.tracking_status = 1;
+
+ /* Store corner */
+ _output->push_back(keypt);
+ for(int32_t k = i + 1; k < num_corner_candidates; ++k)
+ {
+ const float dx = std::fabs(std::get<0>(_in_out[k]) - xc);
+ const float dy = std::fabs(std::get<1>(_in_out[k]) - yc);
+
+ if((dx < _min_distance) && (dy < _min_distance))
+ {
+ const float d = (dx * dx + dy * dy);
+
+ if(d < _min_distance)
+ {
+ /* Invalidate keypoint */
+ std::get<2>(_in_out[k]) = 0.0f;
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
new file mode 100644
index 0000000000..389e390736
--- /dev/null
+++ b/src/core/Error.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <iostream>
+#include <stdexcept>
+
+void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
+{
+ char out[512];
+ va_list args;
+ va_start(args, msg);
+ int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+ vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ va_end(args);
+
+ throw std::runtime_error(std::string(out));
+}
+
+void arm_compute::debug(const char *function, const char *file, const int line, const char *msg, ...)
+{
+ char out[512];
+ va_list args;
+ va_start(args, msg);
+ int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+ vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ va_end(args);
+ std::cout << std::string(out) << std::endl;
+}
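arm_compute::error() is what the ARM_COMPUTE_ERROR* macros used throughout this patch expand to, while debug() is its logging counterpart. A standalone sketch of the resulting behaviour (illustrative only, not part of this patch):

#include "arm_compute/core/Error.h"

#include <iostream>
#include <stdexcept>

int main()
{
    try
    {
        // Formats "in <function> <file>:<line>: <message>" and throws
        arm_compute::error("my_function", "my_file.cpp", 42, "unsupported value: %d", 7);
    }
    catch(const std::runtime_error &e)
    {
        std::cout << e.what() << std::endl; // in my_function my_file.cpp:42: unsupported value: 7
    }
    return 0;
}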
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
new file mode 100644
index 0000000000..1b6175e68f
--- /dev/null
+++ b/src/core/HOGInfo.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/HOGInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+HOGInfo::HOGInfo()
+ : _cell_size(), _block_size(), _detection_window_size(), _block_stride(), _num_bins(0), _normalization_type(HOGNormType::L2HYS_NORM), _l2_hyst_threshold(0.0f), _phase_type(PhaseType::UNSIGNED),
+ _descriptor_size(0)
+{
+}
+
+HOGInfo::HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type, float l2_hyst_threshold, PhaseType phase_type)
+ : HOGInfo()
+{
+ init(cell_size, block_size, detection_window_size, block_stride, num_bins, normalization_type, l2_hyst_threshold, phase_type);
+}
+
+void HOGInfo::init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type, float l2_hyst_threshold, PhaseType phase_type)
+{
+ ARM_COMPUTE_ERROR_ON_MSG((block_size.width % cell_size.width), "The block width must be a multiple of the cell width");
+ ARM_COMPUTE_ERROR_ON_MSG((block_size.height % cell_size.height), "The block height must be a multiple of the cell height");
+ ARM_COMPUTE_ERROR_ON_MSG((block_stride.width % cell_size.width), "The block stride width must be a multiple of the cell width");
+ ARM_COMPUTE_ERROR_ON_MSG((block_stride.height % cell_size.height), "The block stride height must be a multiple of the cell height");
+ ARM_COMPUTE_ERROR_ON_MSG(((detection_window_size.width - block_size.width) % block_stride.width), "The detection window width minus the block width must be a multiple of the block stride width");
+ ARM_COMPUTE_ERROR_ON_MSG(((detection_window_size.height - block_size.height) % block_stride.height), "The detection window height minus the block height must be a multiple of the block stride height");
+
+ _cell_size = cell_size;
+ _block_size = block_size;
+ _detection_window_size = detection_window_size;
+ _block_stride = block_stride;
+ _num_bins = num_bins;
+ _normalization_type = normalization_type;
+ _l2_hyst_threshold = l2_hyst_threshold;
+ _phase_type = phase_type;
+
+ // Compute descriptor size. The +1 takes the bias term into account
+ _descriptor_size = num_cells_per_block().area() * num_blocks_per_image(_detection_window_size).area() * _num_bins + 1;
+}
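A worked example of the descriptor size computed above, using the classic HOG parameters (illustrative only, not part of this patch): 8x8 cells, 16x16 blocks, an 8x8 block stride and a 64x128 detection window give 2x2 cells per block and 7x15 blocks per window, so with 9 bins the descriptor holds 4 * 105 * 9 + 1 = 3781 values (3780 plus the bias).

#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Types.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    const HOGInfo info(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8),
                       9, HOGNormType::L2HYS_NORM, 0.2f, PhaseType::UNSIGNED);
    std::cout << info.descriptor_size() << std::endl; // 3781
    return 0;
}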
+
+Size2D HOGInfo::num_cells_per_block() const
+{
+ return Size2D(_block_size.width / _cell_size.width,
+ _block_size.height / _cell_size.height);
+}
+
+Size2D HOGInfo::num_blocks_per_image(const Size2D &image_size) const
+{
+ return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
+ ((image_size.height - _block_size.height) / _block_stride.height) + 1);
+}
+
+const Size2D &HOGInfo::cell_size() const
+{
+ return _cell_size;
+}
+
+const Size2D &HOGInfo::block_size() const
+{
+ return _block_size;
+}
+
+const Size2D &HOGInfo::detection_window_size() const
+{
+ return _detection_window_size;
+}
+
+const Size2D &HOGInfo::block_stride() const
+{
+ return _block_stride;
+}
+
+size_t HOGInfo::num_bins() const
+{
+ return _num_bins;
+}
+
+HOGNormType HOGInfo::normalization_type() const
+{
+ return _normalization_type;
+}
+
+float HOGInfo::l2_hyst_threshold() const
+{
+ return _l2_hyst_threshold;
+}
+
+PhaseType HOGInfo::phase_type() const
+{
+ return _phase_type;
+}
+
+size_t HOGInfo::descriptor_size() const
+{
+ return _descriptor_size;
+}
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
new file mode 100644
index 0000000000..ff903e9802
--- /dev/null
+++ b/src/core/Helpers.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <cstdint>
+
+using namespace arm_compute;
+
+Window arm_compute::calculate_max_window(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+ if(!skip_border)
+ {
+ border_size = BorderSize(0);
+ }
+
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Skip the border above the image
+ anchor[1] + border_size.top,
+ // Skip the border below the image
+ anchor[1] + border_size.top + ceil_to_multiple(shape[1] - border_size.top - border_size.bottom, steps[1]),
+ steps[1]));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
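A worked example of the window produced above, assuming ceil_to_multiple() rounds its first argument up to the nearest multiple of the second (illustrative only, not part of this patch):

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // 17x10 F32 tensor, 8 elements per step in X, 1-pixel undefined border skipped
    TensorInfo   info(TensorShape(17U, 10U), 1, DataType::F32);
    const Window win = calculate_max_window(info, Steps(8), true, BorderSize(1));

    // X: start = 1, end = 1 + ceil_to_multiple(17 - 2, 8) = 17, step 8 -> iterations at x = 1 and x = 9
    std::cout << win.x().start() << " " << win.x().end() << " " << win.x().step() << std::endl;
    // Y: start = 1, end = 1 + ceil_to_multiple(10 - 2, 1) = 9, step 1
    std::cout << win.y().start() << " " << win.y().end() << " " << win.y().step() << std::endl;
    return 0;
}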
+
+Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps, BorderSize border_size)
+{
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // Move the anchor so the window starts at the left border
+ anchor[0] - border_size.left,
+ // Extend the window to include the right border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
+
+Window arm_compute::calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+ if(skip_border)
+ {
+ border_size.top = 0;
+ border_size.bottom = 0;
+ }
+ else
+ {
+ border_size.left = 0;
+ border_size.right = 0;
+ }
+
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] + shape[1] + border_size.bottom,
+ 1));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
new file mode 100644
index 0000000000..4ddc0fef1d
--- /dev/null
+++ b/src/core/IAccessWindow.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IAccessWindow.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+{
+ return compute_valid_region(window, input_valid_region, false, BorderSize(0));
+}
+
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ Coordinates old_anchor(anchor);
+ TensorShape &shape = input_valid_region.shape;
+
+ if(!border_undefined)
+ {
+ border_size = BorderSize(0);
+ }
+
+ // Start of the valid region is equal to the start of the window. But it
+ // cannot be less than the start of the input's valid region plus the border
+ // size required by this kernel (if the border is undefined).
+ // Additionally the valid region is shifted by the offset that is used by
+ // the kernel to write back output values.
+ anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+ }
+
+ // End of the valid region is equal to the start of the last write of the
+ // kernel plus the number of written elements. (This assumes that all
+ // written elements are valid). Nevertheless the end cannot be larger than
+ // the end of the input's valid region minus the border size.
+ // Note: the region stores its size rather than its end points. Thus the
+ // old size is first converted into end points to be compared against the
+ // execution window. Afterwards the new end points are converted back into
+ // a size of the region.
+ shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ }
+
+ // For higher dimensions use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
+
+void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+{
+ if(_info != nullptr)
+ {
+ _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
+ }
+}
+
+bool AccessWindowRectangle::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ const int min_y = window.y().start() * _scale_y + _y;
+ const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
+
+ // Adjust window start for Y dimension
+ if(min_y < 0)
+ {
+ // Calculate rows available above the tensor
+ const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
+
+ if(min_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+
+ window.set(1, Window::Dimension(start / _scale_y, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, static_cast<int>(std::floor(-window.y().start() * _scale_y)) - _y);
+ }
+
+ // Adjust window end for Y dimension
+ if(max_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
+ window.set(1, Window::Dimension(window.y().start(), end / _scale_y, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ const int min_x = window.x().start() * _scale_x + _x;
+ const int max_x = (window.x().end() - window.x().step()) * _scale_x + _x + _width;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(min_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(min_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
+ window.set(0, Window::Dimension(start / _scale_x, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, static_cast<int>(std::floor(-window.x().start() * _scale_x)) - _x);
+ }
+
+ // Adjust window end for X dimension
+ if(max_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
+ window.set(0, Window::Dimension(window.x().start(), end / _scale_x, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowRectangle::update_padding_if_needed(const Window &window) const
+{
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ ARM_COMPUTE_ERROR_ON(window.x().step() * _scale_x == 0);
+ ARM_COMPUTE_ERROR_ON(window.y().step() * _scale_y == 0);
+
+ const int min_x = window.x().start() * _scale_x + _x;
+ const int max_x = (window.x().end() - window.x().step()) * _scale_x + _x + _width;
+ const int min_y = window.y().start() * _scale_y + _y;
+ const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -min_x);
+ padding.right = std::max<int>(0, max_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
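A worked example of the padding computation above, assuming AccessWindowHorizontal is the height-1 specialisation of AccessWindowRectangle and that the tensor info is still resizable (illustrative only, not part of this patch):

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
    // 20x4 U8 tensor processed 16 elements at a time
    TensorInfo info(TensorShape(20U, 4U), 1, DataType::U8);
    Window     win = calculate_max_window(info, Steps(16)); // X: [0, 32) with step 16

    // The last iteration starts at x = 16 and reads elements 16..31,
    // so max_x = 32 and padding.right grows to 32 - 20 = 12 elements.
    AccessWindowHorizontal access(&info, 0, 16);
    update_window_and_padding(win, access);

    return 0;
}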
diff --git a/src/core/IDistribution.cpp b/src/core/IDistribution.cpp
new file mode 100644
index 0000000000..7d7186989e
--- /dev/null
+++ b/src/core/IDistribution.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IDistribution.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+void IDistribution::clear() const
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == buffer());
+ std::memset(buffer(), 0, size());
+}
diff --git a/src/core/IDistribution1D.cpp b/src/core/IDistribution1D.cpp
new file mode 100644
index 0000000000..f304289991
--- /dev/null
+++ b/src/core/IDistribution1D.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+IDistribution1D::IDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : _num_bins(num_bins), _offset(offset), _range(range)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_bins, "Invalid number of bins, it should be greater than 0");
+}
+
+size_t IDistribution1D::num_bins() const
+{
+ return _num_bins;
+}
+
+int32_t IDistribution1D::offset() const
+{
+ return _offset;
+}
+
+uint32_t IDistribution1D::range() const
+{
+ return _range;
+}
+
+uint32_t IDistribution1D::window() const
+{
+ return _range / _num_bins;
+}
+
+size_t IDistribution1D::size() const
+{
+ return _num_bins * sizeof(uint32_t);
+}
+
+void IDistribution1D::set_range(uint32_t range)
+{
+ _range = range;
+}
+
+size_t IDistribution1D::dimensions() const
+{
+ return 1;
+}
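A worked example of the accessors above, using the concrete runtime Distribution1D wrapper (illustrative only, not part of this patch): 256 bins over the value range [0, 256) give a bin window of 1 value and a 256 * sizeof(uint32_t) = 1024 byte buffer.

#include "arm_compute/runtime/Distribution1D.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    Distribution1D dist(256 /* num_bins */, 0 /* offset */, 256 /* range */);
    std::cout << dist.window() << std::endl; // 256 / 256 = 1
    std::cout << dist.size() << std::endl;   // 256 * sizeof(uint32_t) = 1024 bytes
    return 0;
}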
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
new file mode 100644
index 0000000000..6450a4fc2a
--- /dev/null
+++ b/src/core/IKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IKernel.h"
+
+using namespace arm_compute;
+
+const Window &IKernel::window() const
+{
+ return _window;
+}
+
+IKernel::IKernel()
+ : _window()
+{
+ // Create an empty window to make sure the child classes set the window values themselves
+ _window.set(Window::DimX, Window::Dimension(0, 0, 1));
+ _window.set(Window::DimY, Window::Dimension(0, 0, 1));
+}
+
+bool IKernel::is_parallelisable() const
+{
+ return true;
+}
+
+BorderSize IKernel::border_size() const
+{
+ return BorderSize(0);
+}
+
+void IKernel::configure(const Window &window)
+{
+ _window = window;
+}
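Stripped down, IKernel only records the execution window: the default constructor leaves an empty window, so a derived kernel is expected to compute one in its own configure() and register it through IKernel::configure(). The sketch below shows that hand-off pattern in isolation; it is not part of the commit, and ToyWindow/ToyKernel are stand-ins rather than library types.

    #include <cstddef>
    #include <iostream>

    // Stand-in for a window: just a 1-D extent here.
    struct ToyWindow
    {
        std::size_t num_elements{ 0 };
    };

    // Mirrors the IKernel pattern: the base class owns the window,
    // the derived class computes it and hands it over.
    class ToyKernel
    {
    public:
        const ToyWindow &window() const { return _window; }

    protected:
        void configure(const ToyWindow &win) { _window = win; }

    private:
        ToyWindow _window{};
    };

    class ToyCopyKernel : public ToyKernel
    {
    public:
        void configure(std::size_t num_elements)
        {
            ToyWindow win;
            win.num_elements = num_elements; // derived kernel decides the iteration space
            ToyKernel::configure(win);       // base class records it, as IKernel does
        }
    };

    int main()
    {
        ToyCopyKernel k;
        k.configure(128);
        std::cout << "window covers " << k.window().num_elements << " elements\n";
        return 0;
    }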
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
new file mode 100644
index 0000000000..0b29eca57b
--- /dev/null
+++ b/src/core/ITensor.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+#include <limits>
+
+using namespace arm_compute;
+
+void ITensor::copy_from(const ITensor &src)
+{
+ if(&src == this)
+ {
+ return;
+ }
+
+ const ITensorInfo *src_info = src.info();
+ ITensorInfo *dst_info = this->info();
+
+ ARM_COMPUTE_ERROR_ON(src_info->num_dimensions() > dst_info->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
+ ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
+
+ for(size_t d = 0; d < src_info->num_dimensions(); d++)
+ {
+ ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
+ }
+
+ // Copy information about valid region
+ dst_info->set_valid_region(src_info->valid_region());
+
+ Window win_src;
+ win_src.use_tensor_dimensions(src_info, Window::DimY);
+ Window win_dst;
+ win_dst.use_tensor_dimensions(dst_info, Window::DimY);
+
+ Iterator src_it(&src, win_src);
+ Iterator dst_it(this, win_dst);
+
+ const size_t line_size = src_info->num_channels() * src_info->element_size() * src_info->dimension(0);
+
+ execute_window_loop(win_src, [&](const Coordinates & id)
+ {
+ memcpy(dst_it.ptr(), src_it.ptr(), line_size);
+ },
+ src_it, dst_it);
+}
+
+void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
+{
+ ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
+
+ const DataType dt = this->info()->data_type();
+ const size_t slices2D = this->info()->tensor_shape().total_size_upper(2);
+ const Strides strides = this->info()->strides_in_bytes();
+ const PaddingSize padding = this->info()->padding();
+
+ // Set precision
+ if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+ {
+ int precision = io_fmt.precision;
+ if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+ {
+ precision = std::numeric_limits<float>().max_digits10;
+ }
+ s.precision(precision);
+ }
+
+ // Define region to print
+ size_t print_width = 0;
+ size_t print_height = 0;
+ int start_offset = 0;
+ switch(io_fmt.print_region)
+ {
+ case IOFormatInfo::PrintRegion::NoPadding:
+ print_width = this->info()->dimension(0);
+ print_height = this->info()->dimension(1);
+ start_offset = this->info()->offset_first_element_in_bytes();
+ break;
+ case IOFormatInfo::PrintRegion::ValidRegion:
+ print_width = this->info()->valid_region().shape.x();
+ print_height = this->info()->valid_region().shape.y();
+ start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
+ this->info()->valid_region().anchor.y()));
+ break;
+ case IOFormatInfo::PrintRegion::Full:
+ print_width = padding.left + this->info()->dimension(0) + padding.right;
+ print_height = padding.top + this->info()->dimension(1) + padding.bottom;
+ start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+ break;
+ default:
+ break;
+ }
+
+ // Set pointer to start
+ const uint8_t *ptr = this->buffer() + start_offset;
+
+ // Start printing
+ for(size_t i = 0; i < slices2D; ++i)
+ {
+ // Find max_width of elements in slice to align columns
+ int max_element_width = 0;
+ if(io_fmt.align_columns)
+ {
+ size_t offset = i * strides[2];
+ for(size_t h = 0; h < print_height; ++h)
+ {
+ max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+ offset += strides[1];
+ }
+ }
+
+ // Print slice
+ {
+ size_t offset = i * strides[2];
+ for(size_t h = 0; h < print_height; ++h)
+ {
+ print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
+ offset += strides[1];
+ s << io_fmt.row_delim;
+ }
+ s << io_fmt.row_delim;
+ }
+ }
+}
\ No newline at end of file
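The core of copy_from() above is a row-wise copy: the windows are built over dimension Y and upward, so each iteration copies one contiguous line of line_size bytes, which stays correct when the destination rows are padded. The standalone sketch below (hypothetical raw buffers, not the library API, and not part of the commit) spells out the same loop for a padded destination.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    int main()
    {
        // Hypothetical 2-D U8 tensors: source 4x3, destination 6x3 (2 bytes of row padding).
        const std::size_t width = 4, height = 3;
        const std::size_t src_stride = 4;  // tightly packed
        const std::size_t dst_stride = 6;  // padded rows, so one bulk memcpy would be wrong

        std::vector<uint8_t> src(src_stride * height);
        std::vector<uint8_t> dst(dst_stride * height, 0);
        for(std::size_t i = 0; i < src.size(); ++i)
        {
            src[i] = static_cast<uint8_t>(i);
        }

        // Equivalent of ITensor::copy_from's loop: one memcpy of line_size bytes per row.
        const std::size_t line_size = width * sizeof(uint8_t);
        for(std::size_t y = 0; y < height; ++y)
        {
            std::memcpy(dst.data() + y * dst_stride, src.data() + y * src_stride, line_size);
        }

        assert(dst[0 * dst_stride + 3] == 3 && dst[1 * dst_stride + 0] == 4);
        return 0;
    }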
diff --git a/src/core/MultiImageInfo.cpp b/src/core/MultiImageInfo.cpp
new file mode 100644
index 0000000000..1e40a77c82
--- /dev/null
+++ b/src/core/MultiImageInfo.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/MultiImageInfo.h"
+
+using namespace arm_compute;
+
+MultiImageInfo::MultiImageInfo()
+ : _width(0), _height(0), _format(Format::UNKNOWN)
+{
+}
+
+void MultiImageInfo::init(unsigned int width, unsigned int height, Format format)
+{
+ _format = format;
+ _width = width;
+ _height = height;
+}
+
+Format MultiImageInfo::format() const
+{
+ return _format;
+}
+
+unsigned int MultiImageInfo::width() const
+{
+ return _width;
+}
+
+unsigned int MultiImageInfo::height() const
+{
+ return _height;
+}
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..edb0a0f304
--- /dev/null
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input1_val = vld1q_u8(input1.ptr());
+ const uint8x16_t input2_val = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vabdq_u8(input1_val, input2_val));
+ },
+ input1, input2, output);
+}
+
+inline int16x8x2_t vqabd2q_s16(const int16x8x2_t &v1, const int16x8x2_t &v2)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vqabsq_s16(vqsubq_s16(v1.val[0], v2.val[0])),
+ vqabsq_s16(vqsubq_s16(v1.val[1], v2.val[1]))
+ }
+ };
+
+ return res;
+}
+
+void abs_diff_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqabd2q_s16(input1_val, input2_val));
+ },
+ input1, input2, output);
+}
+
+void abs_diff_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input1_val = vld1q_u8(input1.ptr());
+ const int16x8x2_t input2_val =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t out_val =
+ {
+ {
+ vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input1_val))), input2_val.val[0])),
+ vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input1_val))), input2_val.val[1]))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out_val.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, out_val.val[1]);
+
+ },
+ input1, input2, output);
+}
+
+void abs_diff_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ abs_diff_U8_S16_S16(in2, in1, out, window);
+}
+} // namespace
+
+NEAbsoluteDifferenceKernel::NEAbsoluteDifferenceKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::U8);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "The output image can only be U8 if both input images are U8");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ const DataType input1_data_type = input1->info()->data_type();
+ const DataType input2_data_type = input2->info()->data_type();
+
+ if(input1_data_type == input2_data_type)
+ {
+ if(input1_data_type == DataType::U8)
+ {
+ _func = &abs_diff_U8_U8_U8;
+ }
+ else
+ {
+ _func = &abs_diff_S16_S16_S16;
+ }
+ }
+ else
+ {
+ if(input1_data_type == DataType::U8)
+ {
+ _func = &abs_diff_U8_S16_S16;
+ }
+ else
+ {
+ _func = &abs_diff_S16_U8_S16;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEAbsoluteDifferenceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ _func(_input1, _input2, _output, window);
+}
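In the S16 paths above the absolute difference saturates at both steps: vqsubq_s16 clamps the subtraction to the S16 range and vqabsq_s16 maps INT16_MIN to INT16_MAX. A scalar reference of that per-element behaviour follows; it is illustrative only and not part of the commit.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scalar counterpart of vqsubq_s16: subtraction saturated to the int16_t range.
    static int16_t sat_sub_s16(int16_t a, int16_t b)
    {
        const int32_t diff = static_cast<int32_t>(a) - static_cast<int32_t>(b);
        return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(diff, INT16_MIN), INT16_MAX));
    }

    // Scalar counterpart of vqabsq_s16: abs(INT16_MIN) saturates to INT16_MAX.
    static int16_t sat_abs_s16(int16_t v)
    {
        return (v == INT16_MIN) ? INT16_MAX : static_cast<int16_t>(v < 0 ? -v : v);
    }

    int main()
    {
        // U8 path: plain absolute difference, the result always fits in U8.
        const uint8_t u_a = 10, u_b = 200;
        assert(static_cast<uint8_t>(u_a > u_b ? u_a - u_b : u_b - u_a) == 190);

        // S16 path: matches vqabsq_s16(vqsubq_s16(a, b)).
        assert(sat_abs_s16(sat_sub_s16(100, -50)) == 150);
        assert(sat_abs_s16(sat_sub_s16(INT16_MIN, INT16_MAX)) == INT16_MAX); // saturates
        return 0;
    }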
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
new file mode 100644
index 0000000000..e5b933a781
--- /dev/null
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+/* Max S16 value used for saturation purposes. */
+const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
+{
+ const float16x8x2_t out =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(input)))
+ }
+ };
+
+ return out;
+}
+
+inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
+{
+ return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
+ vmovn_u16(vcvtq_u16_f16(input.val[1])));
+}
+
+inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
+ vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2)
+ }
+ };
+
+ return res;
+}
+
+void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
+
+ const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
+ uint8x16x4_t accum_buffer = vld4q_u8(accum_ptr);
+
+ const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
+ const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
+ const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
+ const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);
+
+ float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
+ float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
+ float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
+ float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);
+
+ f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
+ f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
+ f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
+ f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);
+
+ accum_buffer = { {
+ convert_f16x8x2_to_u8x16(f16_accum_0),
+ convert_f16x8x2_to_u8x16(f16_accum_1),
+ convert_f16x8x2_to_u8x16(f16_accum_2),
+ convert_f16x8x2_to_u8x16(f16_accum_3)
+ }
+ };
+
+ vst4q_u8(accum_ptr, accum_buffer);
+}
+} // namespace fp16
+
+void NEAccumulateWeightedFP16Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ const float16x8_t scale_val = vdupq_n_f16(1.f - _alpha);
+ const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
+ },
+ input, accum);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
+inline void acc_v16_u8(const void *__restrict input, void *__restrict accum)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto in = static_cast<const uint8_t *__restrict>(input);
+ const auto out = static_cast<int16_t *__restrict>(accum);
+
+ uint8x16_t ta1 = vld1q_u8(in);
+ int16x8_t ta2 = vld1q_s16(out);
+ int16x8_t ta3 = vld1q_s16(out + 8);
+
+ ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
+ ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));
+
+ vst1q_s16(out, ta2);
+ vst1q_s16(out + 8, ta3);
+}
+
+inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
+{
+ const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
+ const uint16x8_t u16_output_hi = vmovl_u8(vget_high_u8(input));
+
+ const float32x4x4_t res =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi)))
+ }
+ };
+
+ return res;
+}
+
+inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
+{
+ return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
+ vmovn_u32(vcvtq_u32_f32(input.val[1])))),
+ vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
+ vmovn_u32(vcvtq_u32_f32(input.val[3])))));
+}
+
+inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2)
+{
+ vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
+ vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
+ vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
+ vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);
+
+ vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
+ vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
+ vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
+ vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);
+
+ return vector_output;
+}
+
+inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
+
+ const uint8x16_t input_buffer = vld1q_u8(input_ptr);
+ const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);
+
+ const float32x4x4_t f32_input_0 = convert_u8x16_to_f32x4x4(input_buffer);
+ const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);
+
+ const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);
+
+ vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
+}
+
+void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_buffer = static_cast<int16_t *__restrict>(accum);
+
+ const uint8x16_t ta1 = vld1q_u8(input_buffer);
+ uint16x8_t ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
+ uint16x8_t ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));
+
+ const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));
+
+ uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
+ uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));
+
+ linput = vmulq_u16(linput, linput);
+ hinput = vmulq_u16(hinput, hinput);
+
+ linput = vqshlq_u16(linput, vector_shift);
+ hinput = vqshlq_u16(hinput, vector_shift);
+
+ ta2 = vqaddq_u16(ta2, linput);
+ ta3 = vqaddq_u16(ta3, hinput);
+
+ vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
+ vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
+}
+} // namespace
+
+void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_v16_u8(input.ptr(), accum.ptr());
+ },
+ input, accum);
+}
+
+NEAccumulateWeightedKernel::NEAccumulateWeightedKernel()
+ : _alpha(0.0f)
+{
+}
+
+void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
+
+ _alpha = alpha;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateWeightedKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.f - _alpha);
+ const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
+ },
+ input, accum);
+}
+
+NEAccumulateSquaredKernel::NEAccumulateSquaredKernel()
+ : _shift(0)
+{
+}
+
+void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateSquaredKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
+ },
+ input, accum);
+}
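Per element, the three kernels above reduce to simple scalar formulas: plain accumulation is a saturating S16 add, weighted accumulation blends the accumulator and the input with alpha and converts back to U8 through float, and squared accumulation right-shifts the squared input before a saturating add capped at INT16_MAX. The sketch below states those formulas directly; it is illustrative only and not part of the commit.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // accum = saturate_s16(accum + input)
    static int16_t accumulate(int16_t accum, uint8_t input)
    {
        return static_cast<int16_t>(std::min<int32_t>(accum + input, INT16_MAX));
    }

    // accum = (1 - alpha) * accum + alpha * input, truncated back to U8 as the float path does
    static uint8_t accumulate_weighted(uint8_t accum, uint8_t input, float alpha)
    {
        return static_cast<uint8_t>((1.f - alpha) * accum + alpha * input);
    }

    // accum = min(INT16_MAX, accum + ((input * input) >> shift)), with shift in [0, 15]
    static int16_t accumulate_squared(int16_t accum, uint8_t input, uint32_t shift)
    {
        const uint32_t sq  = (static_cast<uint32_t>(input) * input) >> shift;
        const uint32_t sum = static_cast<uint16_t>(accum) + sq; // the kernel adds in the u16 domain
        return static_cast<int16_t>(std::min<uint32_t>(sum, INT16_MAX));
    }

    int main()
    {
        assert(accumulate(32760, 100) == INT16_MAX);         // saturates
        assert(accumulate_weighted(100, 200, 0.5f) == 150);  // simple blend
        assert(accumulate_squared(0, 16, 4) == 16);          // (16 * 16) >> 4 = 16
        return 0;
    }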
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
new file mode 100644
index 0000000000..a878078007
--- /dev/null
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <array>
+#include <cmath>
+#include <map>
+
+using namespace arm_compute;
+
+NEActivationLayerKernel::NEActivationLayerKernel()
+ : _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
+{
+}
+
+void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Auto-initialize the output if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ // Activation functions : FP32
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
+ };
+
+ // Activation functions : QS8
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
+ };
+
+ _input = input;
+ _output = output;
+ _act_info = activation_info;
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
+ case DataType::QS8:
+ _func = act_map_qs8[activation_info.activation()];
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+ const float32x4_t a = vdupq_n_f32(_act_info.a());
+ const float32x4_t b = vdupq_n_f32(_act_info.b());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4x4_t in = vld4q_f32(input_ptr);
+ float32x4x4_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vabsq_f32(in.val[0]),
+ vabsq_f32(in.val[1]),
+ vabsq_f32(in.val[2]),
+ vabsq_f32(in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vmlaq_f32(b, a, in.val[0]),
+ vmlaq_f32(b, a, in.val[1]),
+ vmlaq_f32(b, a, in.val[2]),
+ vmlaq_f32(b, a, in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_f32(CONST_0, in.val[0]),
+ vmaxq_f32(CONST_0, in.val[1]),
+ vmaxq_f32(CONST_0, in.val[2]),
+ vmaxq_f32(CONST_0, in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vinvq_f32(vinvsqrtq_f32(in.val[0])),
+ vinvq_f32(vinvsqrtq_f32(in.val[1])),
+ vinvq_f32(vinvsqrtq_f32(in.val[2])),
+ vinvq_f32(vinvsqrtq_f32(in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vmulq_f32(in.val[0], in.val[0]),
+ vmulq_f32(in.val[1], in.val[1]),
+ vmulq_f32(in.val[2], in.val[2]),
+ vmulq_f32(in.val[3], in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))),
+ }
+ };
+ break;
+ default:
+ break;
+ }
+
+ vst4q_f32(output_ptr, tmp);
+ },
+ input, output);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
+ const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const qint8x16_t in = vld1q_qs8(input_ptr);
+ qint8x16_t tmp = {};
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp = vqabsq_qs8(in);
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+ break;
+ case ActivationFunction::LINEAR:
+ tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::RELU:
+ tmp = vmaxq_qs8(CONST_0, in);
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::SQRT:
+ tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ break;
+ case ActivationFunction::SQUARE:
+ tmp = vqmulq_qs8(in, in, fixed_point_position);
+ break;
+ case ActivationFunction::TANH:
+ tmp = vtanhq_qs8(in, fixed_point_position);
+ break;
+ default:
+ break;
+ }
+
+ vst1q_qs8(output_ptr, tmp);
+ },
+ input, output);
+}
+
+void NEActivationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
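Ignoring the vectorisation, the F32 activation cases above compute the scalar expressions listed below, with a and b taken from ActivationLayerInfo (used by LINEAR, BOUNDED_RELU and TANH). The code is illustrative only and not part of the commit.

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    // Scalar equivalents of the F32 cases in NEActivationLayerKernel::activation().
    static float act_abs(float x)                      { return std::fabs(x); }
    static float act_linear(float x, float a, float b) { return a * x + b; }
    static float act_logistic(float x)                 { return 1.f / (1.f + std::exp(-x)); }
    static float act_relu(float x)                     { return std::max(0.f, x); }
    static float act_bounded_relu(float x, float a)    { return std::min(a, std::max(0.f, x)); }
    static float act_soft_relu(float x)                { return std::log(1.f + std::exp(x)); }
    static float act_sqrt(float x)                     { return std::sqrt(x); }
    static float act_square(float x)                   { return x * x; }
    static float act_tanh(float x, float a, float b)   { return a * std::tanh(b * x); }

    int main()
    {
        assert(act_relu(-2.f) == 0.f);
        assert(act_bounded_relu(7.f, 6.f) == 6.f);
        assert(act_linear(2.f, 3.f, 1.f) == 7.f);
        assert(std::fabs(act_logistic(0.f) - 0.5f) < 1e-6f);
        (void)act_abs; (void)act_soft_relu; (void)act_sqrt; (void)act_square; (void)act_tanh;
        return 0;
    }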
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..a4fdad8a2a
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
+ },
+ input1, input2, output);
+}
+
+inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vaddq_s16(a.val[0], b.val[0]),
+ vaddq_s16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+
+inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ const float32x4x4_t res =
+ {
+ {
+ vaddq_f32(a.val[0], b.val[0]),
+ vaddq_f32(a.val[1], b.val[1]),
+ vaddq_f32(a.val[2], b.val[2]),
+ vaddq_f32(a.val[3], b.val[3])
+ }
+ };
+
+ return res;
+}
+
+inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vqaddq_s16(a.val[0], b.val[0]),
+ vqaddq_s16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+
+void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
+ const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
+
+ vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+ }
+ };
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+ }
+ };
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
+ },
+ input1, input2, output);
+}
+
+inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+{
+ //Simply swap the two input buffers:
+ add_wrap_S16_U8_S16(input2, input1, output, window);
+}
+
+inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+{
+ //Simply swap the two input buffers:
+ add_saturate_S16_U8_S16(input2, input1, output, window);
+}
+
+void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t a = vld1q_u8(input1.ptr());
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ const int16x8x2_t a_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+ }
+ };
+
+ const int16x8x2_t b_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t a = vld1q_u8(input1.ptr());
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ const int16x8x2_t a_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+ }
+ };
+
+ const int16x8x2_t b_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
+ },
+ input1, input2, output);
+}
+} // namespace
+
+NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+
+ static std::map<std::string, AddFunction *> map_function =
+ {
+ { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
+ { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
+ { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
+ { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
+ { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
+ { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
+ { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
+ { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
+ { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
+ { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
+ { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
+ { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
+ };
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ std::string function_to_call("add_");
+ function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ _func = it->second;
+ }
+ else
+ {
+        ARM_COMPUTE_ERROR("Unsupported combination of tensor data types for arithmetic addition");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEArithmeticAdditionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input1, _input2, _output, window);
+}
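The kernel is picked by assembling a key such as "add_saturate_U8_U8_S16" from the convert policy and the three data types. Per element the two policies differ as in the scalar sketch below: WRAP is modular arithmetic in the output type, SATURATE clamps to its limits. The sketch is illustrative only and not part of the commit.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // WRAP policy on U8 outputs: modular arithmetic, like vaddq_u8.
    static uint8_t add_wrap_u8(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(a + b); // wraps modulo 256
    }

    // SATURATE policy on U8 outputs: clamps to [0, 255], like vqaddq_u8.
    static uint8_t add_saturate_u8(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(std::min<int>(a + b, 255));
    }

    // Key construction mirrors configure(): "add_" + policy + "_" + in1 + "_" + in2 + "_" + out.
    static const char *example_key = "add_saturate_U8_U8_S16";

    int main()
    {
        assert(add_wrap_u8(200, 100) == 44);      // 300 mod 256
        assert(add_saturate_u8(200, 100) == 255); // clamped
        (void)example_key;
        return 0;
    }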
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..d3e62b069e
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t ta1 = vld1q_u8(input1.ptr());
+ const uint8x16_t ta2 = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t ta1 = vld1q_u8(input1.ptr());
+ const uint8x16_t ta2 = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ const int16x8x2_t ta3 =
+ {
+ {
+ vsubq_s16(ta1.val[0], ta2.val[0]),
+ vsubq_s16(ta1.val[1], ta2.val[1])
+ }
+ };
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ const int16x8x2_t ta3 =
+ {
+ {
+ vqsubq_s16(ta1.val[0], ta2.val[0]),
+ vqsubq_s16(ta1.val[1], ta2.val[1])
+ }
+ };
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+
+void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
+ const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
+
+ const float32x4x4_t ta3 =
+ {
+ {
+ vsubq_f32(ta1.val[0], ta2.val[0]),
+ vsubq_f32(ta1.val[1], ta2.val[1]),
+ vsubq_f32(ta1.val[2], ta2.val[2]),
+ vsubq_f32(ta1.val[3], ta2.val[3]),
+ }
+ };
+
+ vst4q_f32(reinterpret_cast<float *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
+
+ a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
+
+ a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
+
+ a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
+ a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
+
+ a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
+ a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t av_0 = vld1q_u8(input1.ptr());
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+
+ const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t av_0 = vld1q_u8(input1.ptr());
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+
+ const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+} // namespace
+
+NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+
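+ // The implementation is selected at configure time: the convert policy and the
+ // input/output data types are concatenated into a key (e.g. "sub_saturate_U8_U8_S16")
+ // which is looked up in this map; run() then simply invokes the selected function.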
+ static std::map<std::string, SubFunction *> map_function =
+ {
+ { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
+ { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
+ { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
+ { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
+ { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
+ { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
+ { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
+ { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
+ { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
+ { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
+ { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
+ { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
+ };
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ std::string function_to_call("sub_");
+ function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ _func = it->second;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("You called subtract with the wrong image formats");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEArithmeticSubtractionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input1, _input2, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..9a216aecde
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+{
+}
+
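+// Both specializations below compute, per feature map (z slice):
+//   out = gamma * (x - mean) / sqrt(var + epsilon) + beta
+// The per-slice constants are broadcast into NEON vectors once and reused for every element of the slice.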
+void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Track the feature map (z slice) currently being iterated over.
+ // The denominator and the per-slice NEON vectors are only computed once per feature map.
+ int slice = -1;
+
+ int fixed_point_position = in->info()->fixed_point_position();
+ const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ qint8x16_t mean_vec = vdupq_n_qs8(0);
+ qint8x16_t var_vec = vdupq_n_qs8(0);
+ qint8x16_t gamma_vec = vdupq_n_qs8(0);
+ qint8x16_t beta_vec = vdupq_n_qs8(0);
+ qint8x16_t denominator = vdupq_n_qs8(0);
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+ // Construct vectors
+ mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs8(*(input_var + id.z()));
+ gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
+ const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
+ },
+ input, output);
+}
+
+void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Track the feature map (z slice) currently being iterated over.
+ // The denominator and the per-slice NEON vectors are only computed once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ float32x4_t mean_vec = vdupq_n_f32(0.0);
+ float32x4_t var_vec = vdupq_n_f32(0.0);
+ float32x4_t gamma_vec = vdupq_n_f32(0.0);
+ float32x4_t beta_vec = vdupq_n_f32(0.0);
+ float32x4_t denominator = vdupq_n_f32(0.0);
+ const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+ // Construct vectors
+ mean_vec = vdupq_n_f32(*(input_mean + id.z()));
+ var_vec = vdupq_n_f32(*(input_var + id.z()));
+ gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+ const float32x4_t x_bar = vmulq_f32(numerator, denominator);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
+ },
+ input, output);
+}
+
+void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
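+ // The step matches the NEON vector width of the data type: 16 lanes for QS8, 4 lanes for F32.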
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &batch_normalization_q8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ _func = &batch_normalization_fp32;
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEBatchNormalizationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..e8e448e455
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_and_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, vandq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseAndKernel::NEBitwiseAndKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseAndKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_and_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..bf75592677
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restrict output)
+{
+ const uint8x16_t val0 = vld1q_u8(input);
+
+ vst1q_u8(output, vmvnq_u8(val0));
+}
+} // namespace
+
+NEBitwiseNotKernel::NEBitwiseNotKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseNotKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_not_U8_U8(input.ptr(), output.ptr());
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..f184be2f26
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, vorrq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseOrKernel::NEBitwiseOrKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseOrKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c4fb4c0d03
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, veorq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseXorKernel::NEBitwiseXorKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseXorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
new file mode 100644
index 0000000000..d7e6d73cd7
--- /dev/null
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void NEBox3x3FP16Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
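+ // FP16 path of the 3x3 box filter: the nine neighbours are widened to F16 and accumulated
+ // in float16x8_t vectors, so eight output pixels are averaged per iteration in half precision.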
+ const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const float16x8x2_t top_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+
+ const float16x8x2_t mid_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+
+ const float16x8x2_t bot_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ float16x8_t out = top_f16.val[0];
+ //top mid
+ out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 1));
+ //top right
+ out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 2));
+ //mid left
+ out = vaddq_f16(out, mid_f16.val[0]);
+ //mid mid
+ out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 1));
+ //mid right
+ out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 2));
+ //bot left
+ out = vaddq_f16(out, bot_f16.val[0]);
+ //bot mid
+ out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 1));
+ //bot right
+ out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 2));
+
+ out = vmulq_f16(out, oneovernine);
+
+ vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(out)));
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+BorderSize NEBox3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+ constexpr int rect_offset_xy = -1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration), output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEBox3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
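+ // Box 3x3 filter: each output pixel is the mean of its 3x3 neighbourhood. The nine U8
+ // neighbours are widened to S16 and accumulated, the sums are scaled by 1/9 in F32, and
+ // the result is narrowed back to U8 with saturation.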
+ const float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ int16x8_t out = top_s16.val[0];
+ //top mid
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1));
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vaddq_s16(out, mid_s16.val[0]);
+ //mid mid
+ out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1));
+ //mid right
+ out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2));
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1));
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ float32x4_t outfloathigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out)));
+ float32x4_t outfloatlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out)));
+
+ outfloathigh = vmulq_f32(outfloathigh, oneovernine);
+ outfloatlow = vmulq_f32(outfloatlow, oneovernine);
+
+ out = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(outfloatlow)),
+ vqmovn_s32(vcvtq_s32_f32(outfloathigh)));
+
+ vst1_u8(output.ptr(), vqmovun_s16(out));
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
new file mode 100644
index 0000000000..85a2cd5855
--- /dev/null
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -0,0 +1,1856 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+constexpr int NO_EDGE = 0;
+constexpr int EDGE = 255;
+constexpr int MAYBE = 127;
+} // namespace
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
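+ // The gradient direction is quantized to one of four bins (0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°)
+ // by scoring (gx, gy) against each direction and keeping the bin with the largest absolute score.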
+ // Constants used for evaluating score1 and score3
+ static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
+ static const float32x4_t zero = vdupq_n_f32(0.0f);
+ static const float32x4_t one = vdupq_n_f32(1.0f);
+ static const float32x4_t two = vdupq_n_f32(2.0f);
+ static const float32x4_t three = vdupq_n_f32(3.0f);
+
+ // Score0: (1, 0)
+ const float32x4x2_t score0 =
+ {
+ vabsq_f32(gx.val[0]),
+ vabsq_f32(gx.val[1])
+ };
+
+ // Score2: ( 0, 1 )
+ const float32x4x2_t score2 =
+ {
+ vabsq_f32(gy.val[0]),
+ vabsq_f32(gy.val[1])
+ };
+
+ // Score1: direction ( sqrt(2)/2, sqrt(2)/2 ); Score3: direction ( -sqrt(2)/2, sqrt(2)/2 )
+ float32x4x2_t score1 =
+ {
+ vmulq_f32(gy.val[0], const45),
+ vmulq_f32(gy.val[1], const45)
+ };
+
+ float32x4x2_t score3 = score1;
+
+ score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
+ score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
+ score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
+ score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
+
+ score1.val[0] = vabsq_f32(score1.val[0]);
+ score1.val[1] = vabsq_f32(score1.val[1]);
+ score3.val[0] = vabsq_f32(score3.val[0]);
+ score3.val[1] = vabsq_f32(score3.val[1]);
+
+ float32x4x2_t phase =
+ {
+ zero,
+ zero
+ };
+
+ float32x4x2_t old_score = score0;
+
+ // score1 > old_score?
+ uint32x4x2_t mask =
+ {
+ vcgtq_f32(score1.val[0], old_score.val[0]),
+ vcgtq_f32(score1.val[1], old_score.val[1])
+ };
+
+ phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
+
+ // score2 > old_score?
+ mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
+
+ // score3 > old_score?
+ mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
+
+ // Convert from float32x4_t to uint8x8_t
+ return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
+ vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
+}
+
+inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
+{
+ // Constants used for evaluating score1 and score3
+ static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+ static const float16x8_t one = vdupq_n_f16(1.0f);
+ static const float16x8_t two = vdupq_n_f16(2.0f);
+ static const float16x8_t three = vdupq_n_f16(3.0f);
+
+ // Score0: (1, 0)
+ const float16x8_t score0 = vabsq_f16(gx);
+
+ // Score2: ( 0, 1 )
+ const float16x8_t score2 = vabsq_f16(gy);
+
+ // Score1: direction ( sqrt(2)/2, sqrt(2)/2 ); Score3: direction ( -sqrt(2)/2, sqrt(2)/2 )
+ float16x8_t score1 = vmulq_f16(gy, const45);
+ float16x8_t score3 = score1;
+
+ score1 = vfmaq_f16(score1, gx, const45);
+ score3 = vfmsq_f16(score3, gx, const45);
+
+ score1 = vabsq_f16(score1);
+ score3 = vabsq_f16(score3);
+
+ float16x8_t phase = zero;
+ float16x8_t old_score = score0;
+
+ // score1 > old_score?
+ uint16x8_t mask = vcgtq_f16(score1, old_score);
+
+ phase = vbslq_f16(mask, one, phase);
+ old_score = vbslq_f16(mask, score1, old_score);
+
+ // score2 > old_score?
+ mask = vcgtq_f16(score2, old_score);
+
+ phase = vbslq_f16(mask, two, phase);
+ old_score = vbslq_f16(mask, score2, old_score);
+
+ // score3 > old_score?
+ mask = vcgtq_f16(score3, old_score);
+
+ phase = vbslq_f16(mask, three, phase);
+
+ // Convert from float16x8_t to uint8x8_t
+ return vmovn_u16(vcvtq_u16_f16(phase));
+}
+
+/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
+}
+
+/** Computes the gradient phase if gradient_size = 7. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
+ vreinterpretq_u16_s16(vabsq_s16(gy)));
+}
+
+/** Computes the magnitude using the L1-norm type if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ const uint32x4x2_t gx_abs =
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
+ };
+
+ const uint32x4x2_t gy_abs =
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
+ };
+
+ const uint32x4x2_t out =
+ {
+ vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
+ vaddq_u32(gx_abs.val[1], gy_abs.val[1])
+ };
+
+ return out;
+}
+
+inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+ // x^2 ...
+ float32x4x2_t mag =
+ {
+ vmulq_f32(gx.val[0], gx.val[0]),
+ vmulq_f32(gx.val[1], gx.val[1])
+ };
+
+ // ... + y^2
+ mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
+ mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
+
+ // sqrt(x) approximated as x * rsqrte(x) (single reciprocal square-root estimate)
+ mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
+ mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
+
+ return mag;
+}
+
+inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
+{
+ // x^2 ...
+ float16x8_t mag = vmulq_f16(gx, gx);
+
+ // ... + y^2
+ mag = vfmaq_f16(mag, gy, gy);
+
+ // sqrt(x) approximated as x * rsqrte(x) (single reciprocal square-root estimate)
+ mag = vmulq_f16(vrsqrteq_f16(mag), mag);
+
+ return mag;
+}
+
+/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ /* Compute magnitude using L2 normalization */
+ const float16x8_t gx2 = vcvtq_f16_s16(gx);
+ const float16x8_t gy2 = vcvtq_f16_s16(gy);
+ const float16x8_t mag = mag_l2(gx2, gy2);
+
+ /* Store magnitude - Convert to uint16x8 */
+ return vcvtq_u16_f16(mag);
+}
+
+/** Computes the magnitude using L2-norm if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Compute magnitude using L2 normalization
+ float32x4x2_t gx2 =
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ };
+
+ float32x4x2_t gy2 =
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ };
+
+ const float32x4x2_t mag = mag_l2(gx2, gy2);
+ const uint32x4x2_t mag32 =
+ {
+ vcvtq_u32_f32(mag.val[0]),
+ vcvtq_u32_f32(mag.val[1])
+ };
+
+ return mag32;
+}
+
+/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
+ const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
+ const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
+ const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
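+ // 32 pixels are processed per call: Gx and Gy are each loaded as four int16x8_t vectors.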
+ const int16x8x4_t gx =
+ {
+ vld1q_s16(in1),
+ vld1q_s16(in1 + 8),
+ vld1q_s16(in1 + 16),
+ vld1q_s16(in1 + 24)
+ };
+
+ const int16x8x4_t gy =
+ {
+ vld1q_s16(in2),
+ vld1q_s16(in2 + 8),
+ vld1q_s16(in2 + 16),
+ vld1q_s16(in2 + 24)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
+ vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
+ vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
+ vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
+
+ // Compute and store magnitude using L1 normalization
+ vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
+ vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
+ vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
+ vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
+}
+
+/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
+ const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
+ const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
+ const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ const int16x8x4_t gx =
+ {
+ vld1q_s16(in1),
+ vld1q_s16(in1 + 8),
+ vld1q_s16(in1 + 16),
+ vld1q_s16(in1 + 24)
+ };
+
+ const int16x8x4_t gy =
+ {
+ vld1q_s16(in2),
+ vld1q_s16(in2 + 8),
+ vld1q_s16(in2 + 16),
+ vld1q_s16(in2 + 24)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
+ vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
+ vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
+ vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
+
+ // Compute and store magnitude using L2 normalization
+ vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
+ vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
+ vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
+ vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
+}
+
+/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
+ auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
+ auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
+ auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ vld1q_s32(in1 + 0),
+ vld1q_s32(in1 + 4)
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ vld1q_s32(in1 + 8),
+ vld1q_s32(in1 + 12)
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ vld1q_s32(in2 + 0),
+ vld1q_s32(in2 + 4)
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ vld1q_s32(in2 + 8),
+ vld1q_s32(in2 + 12)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L1 normalization
+ const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(out1 + 0, mag0.val[0]);
+ vst1q_u32(out1 + 4, mag0.val[1]);
+ vst1q_u32(out1 + 8, mag1.val[0]);
+ vst1q_u32(out1 + 12, mag1.val[1]);
+ }
+}
+
+/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
+ auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
+ auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
+ auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ vld1q_s32(in1 + 0),
+ vld1q_s32(in1 + 4)
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ vld1q_s32(in1 + 8),
+ vld1q_s32(in1 + 12)
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ vld1q_s32(in2 + 0),
+ vld1q_s32(in2 + 4)
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ vld1q_s32(in2 + 8),
+ vld1q_s32(in2 + 12)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L2 normalization
+ const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(out1 + 0, mag0.val[0]);
+ vst1q_u32(out1 + 4, mag0.val[1]);
+ vst1q_u32(out1 + 8, mag1.val[0]);
+ vst1q_u32(out1 + 12, mag1.val[1]);
+ }
+}
+
+inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
+{
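+ // Non-maximum suppression with hysteresis classification: the magnitude of each pixel is
+ // compared against its two neighbours along the quantized gradient direction and kept only
+ // if it is a local maximum, then classified as EDGE (> upper_thr), NO_EDGE (<= lower_thr)
+ // or MAYBE (in between).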
+ // Phase for 4 pixels
+ const uint32x4_t pc32 = vmovl_u16(pc);
+
+ // Get magnitude for 4 pixels
+ uint32x4_t mc = vld1q_u32(in);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ // 0 degree
+ const uint32x4_t mk0_0 = vld1q_u32(in - 1);
+ const uint32x4_t mk0_1 = vld1q_u32(in + 1);
+ uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+
+ // 45 degree
+ const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
+ const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
+ uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+
+ // 90 degree
+ const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
+ const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
+ uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+
+ // 135 degree
+ const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
+ const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
+ uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u32(mask0, mask1);
+ mask2 = vorrq_u32(mask2, mask3);
+ mask0 = vorrq_u32(mask0, mask2);
+
+ mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
+
+ // mc > upper_thr
+ mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
+ mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
+
+ mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
+ mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
+ mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
+
+ return vmovn_u32(mc);
+}
+
+/** Recursively traces edges; called by edge_trace_U8_U8
+ *
+ * @param[in] in Pointer to source image. Data type supported U8
+ * @param[out] out Pointer to destination image. Data type supported U8
+ * @param[in] in_stride Stride of the input image
+ * @param[in] out_stride Stride of the output image
+ */
+void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
+{
+ // Look for MAYBE pixels in 8 directions
+ *out = EDGE;
+
+ // (-1, 0)
+ uint8_t pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (+1, 0)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+
+ in -= in_stride;
+ out -= out_stride;
+
+ // (-1, -1)
+ pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (0, -1)
+ pixel = *in;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *in = EDGE;
+
+ edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
+ }
+
+ // (+1, -1)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+
+ in += in_stride * 2;
+ out += out_stride * 2;
+
+ // (-1, +1)
+ pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (0, +1)
+ pixel = *in;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *in = EDGE;
+
+ edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
+ }
+
+ // (+1, +1)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+}
+} // namespace fp16
+
+void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+ ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(_gx->info()->data_type() == DataType::S16)
+ {
+ if(norm_type == 1)
+ {
+ _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
+ }
+ else
+ {
+ _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
+ }
+ }
+ else
+ {
+ if(norm_type == 1)
+ {
+ _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
+ }
+ else
+ {
+ _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+#endif
+
+namespace
+{
+inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+ // Constants used to evaluate score1 and score3 (sqrt(2)/2) and to encode the quantized phase bins
+ static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
+ static const float32x4_t zero = vdupq_n_f32(0.0f);
+ static const float32x4_t one = vdupq_n_f32(1.0f);
+ static const float32x4_t two = vdupq_n_f32(2.0f);
+ static const float32x4_t three = vdupq_n_f32(3.0f);
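+
+ // A scalar sketch of the quantization rule implemented below: the gradient (gx, gy) is
+ // projected onto the four reference directions and the bin with the largest absolute
+ // response wins: score0 = |gx| (0 deg), score1 = |gx + gy|/sqrt(2) (45 deg),
+ // score2 = |gy| (90 deg), score3 = |gy - gx|/sqrt(2) (135 deg).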
+
+ // Score0: (1, 0)
+ const float32x4x2_t score0 =
+ {
+ {
+ vabsq_f32(gx.val[0]),
+ vabsq_f32(gx.val[1])
+ }
+ };
+
+ // Score2: ( 0, 1 )
+ const float32x4x2_t score2 =
+ {
+ {
+ vabsq_f32(gy.val[0]),
+ vabsq_f32(gy.val[1])
+ }
+ };
+
+ // Score1 and Score3: projections onto ( sqrt(2)/2, sqrt(2)/2 ) and ( -sqrt(2)/2, sqrt(2)/2 )
+ float32x4x2_t score1 =
+ {
+ {
+ vmulq_f32(gy.val[0], const45),
+ vmulq_f32(gy.val[1], const45)
+ }
+ };
+
+ float32x4x2_t score3 = score1;
+
+ score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
+ score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
+ score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
+ score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
+
+ score1.val[0] = vabsq_f32(score1.val[0]);
+ score1.val[1] = vabsq_f32(score1.val[1]);
+ score3.val[0] = vabsq_f32(score3.val[0]);
+ score3.val[1] = vabsq_f32(score3.val[1]);
+
+ float32x4x2_t phase =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ float32x4x2_t old_score = score0;
+
+ // score1 > old_score?
+ uint32x4x2_t mask =
+ {
+ {
+ vcgtq_f32(score1.val[0], old_score.val[0]),
+ vcgtq_f32(score1.val[1], old_score.val[1])
+ }
+ };
+
+ phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
+
+ // score2 > old_score?
+ mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
+
+ // score3 > old_score?
+ mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
+
+ // Convert from float32x4_t to uint8x8_t
+ return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
+ vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
+}
+
+/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
+ }
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
+ }
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/* Computes the gradient phase if gradient_size = 7. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ }
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ }
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
+ vreinterpretq_u16_s16(vabsq_s16(gy)));
+}
+
+/* Computes the magnitude using the L1-norm type if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ const uint32x4x2_t gx_abs =
+ {
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
+ }
+ };
+
+ const uint32x4x2_t gy_abs =
+ {
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
+ }
+ };
+
+ const uint32x4x2_t output =
+ {
+ {
+ vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
+ vaddq_u32(gx_abs.val[1], gy_abs.val[1])
+ }
+ };
+
+ return output;
+}
+
+inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+ // x^2 ...
+ float32x4x2_t magnitude =
+ {
+ {
+ vmulq_f32(gx.val[0], gx.val[0]),
+ vmulq_f32(gx.val[1], gx.val[1])
+ }
+ };
+
+ // ... + y^2
+ magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]);
+ magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]);
+
+ // sqrt(x) approximated as x * rsqrt(x), using a single reciprocal square-root estimate
+ magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]);
+ magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]);
+
+ return magnitude;
+}
+
+/* Computes the magnitude using L2-norm if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ // Compute magnitude using L2 normalization
+ const float32x4x2_t gx2 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
+ }
+ };
+
+ const float32x4x2_t gy2 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
+ }
+ };
+
+ const float32x4x2_t magnitude = mag_l2(gx2, gy2);
+
+ // Convert the magnitude to uint16x8_t so the caller can store it
+ return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])),
+ vmovn_u32(vcvtq_u32_f32(magnitude.val[1])));
+}
+
+/* Computes the magnitude using L2-norm if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Compute magnitude using L2 normalization
+ float32x4x2_t gx2 =
+ {
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ }
+ };
+
+ float32x4x2_t gy2 =
+ {
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ }
+ };
+
+ const float32x4x2_t magnitude = mag_l2(gx2, gy2);
+ const uint32x4x2_t mag32 =
+ {
+ {
+ vcvtq_u32_f32(magnitude.val[0]),
+ vcvtq_u32_f32(magnitude.val[1])
+ }
+ };
+
+ return mag32;
+}
+
+/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ const auto gx = static_cast<const int16_t *__restrict>(gx_ptr);
+ const auto gy = static_cast<const int16_t *__restrict>(gy_ptr);
+ const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ const int16x8x4_t gx_val =
+ {
+ {
+ vld1q_s16(gx),
+ vld1q_s16(gx + 8),
+ vld1q_s16(gx + 16),
+ vld1q_s16(gx + 24)
+ }
+ };
+
+ const int16x8x4_t gy_val =
+ {
+ {
+ vld1q_s16(gy),
+ vld1q_s16(gy + 8),
+ vld1q_s16(gy + 16),
+ vld1q_s16(gy + 24)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+ // Compute and store magnitude using L1 normalization
+ vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ const auto gx = static_cast<const int16_t *__restrict>(gx_ptr);
+ const auto gy = static_cast<const int16_t *__restrict>(gy_ptr);
+ const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ const int16x8x4_t gx_val =
+ {
+ {
+ vld1q_s16(gx),
+ vld1q_s16(gx + 8),
+ vld1q_s16(gx + 16),
+ vld1q_s16(gx + 24)
+ }
+ };
+
+ const int16x8x4_t gy_val =
+ {
+ {
+ vld1q_s16(gy),
+ vld1q_s16(gy + 8),
+ vld1q_s16(gy + 16),
+ vld1q_s16(gy + 24)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+ // Compute and store magnitude using L2 normalization
+ vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ auto gx = static_cast<const int32_t *__restrict>(gx_ptr);
+ auto gy = static_cast<const int32_t *__restrict>(gy_ptr);
+ auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+ auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ {
+ vld1q_s32(gx + 0),
+ vld1q_s32(gx + 4)
+ }
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ {
+ vld1q_s32(gx + 8),
+ vld1q_s32(gx + 12)
+ }
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ {
+ vld1q_s32(gy + 0),
+ vld1q_s32(gy + 4)
+ }
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ {
+ vld1q_s32(gy + 8),
+ vld1q_s32(gy + 12)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L1 normalization
+ const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(magnitude + 0, mag0.val[0]);
+ vst1q_u32(magnitude + 4, mag0.val[1]);
+ vst1q_u32(magnitude + 8, mag1.val[0]);
+ vst1q_u32(magnitude + 12, mag1.val[1]);
+ }
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ auto gx = static_cast<const int32_t *__restrict>(gx_ptr);
+ auto gy = static_cast<const int32_t *__restrict>(gy_ptr);
+ auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+ auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ {
+ vld1q_s32(gx + 0),
+ vld1q_s32(gx + 4)
+ }
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ {
+ vld1q_s32(gx + 8),
+ vld1q_s32(gx + 12)
+ }
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ {
+ vld1q_s32(gy + 0),
+ vld1q_s32(gy + 4)
+ }
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ {
+ vld1q_s32(gy + 8),
+ vld1q_s32(gy + 12)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L2 normalization
+ const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(magnitude + 0, mag0.val[0]);
+ vst1q_u32(magnitude + 4, mag0.val[1]);
+ vst1q_u32(magnitude + 8, mag1.val[0]);
+ vst1q_u32(magnitude + 12, mag1.val[1]);
+ }
+}
+
+/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
+ *
+ * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
+ * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
+ * @param[out] output_ptr Pointer to output image. Data type supported U8
+ * @param[in] stride_mag Stride of magnitude image
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ */
+void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
+ const int32_t upper_thr)
+{
+ const auto magnitude = static_cast<const uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<const uint8_t *__restrict>(phase_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ // Get magnitude of the centre pixels
+ uint16x8_t mc = vld1q_u16(magnitude);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
+
+ // 0 degree
+ const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
+ const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
+ uint16x8_t mask0 = vceqq_u16(pc16, vdupq_n_u16(0));
+ mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_0));
+ mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_1));
+
+ // 45 degree
+ const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
+ const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
+ uint16x8_t mask1 = vceqq_u16(pc16, vdupq_n_u16(1));
+ mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_0));
+ mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_1));
+
+ // 90 degree
+ const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
+ const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
+ uint16x8_t mask2 = vceqq_u16(pc16, vdupq_n_u16(2));
+ mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_0));
+ mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_1));
+
+ // 135 degree
+ const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
+ const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
+ uint16x8_t mask3 = vceqq_u16(pc16, vdupq_n_u16(3));
+ mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_0));
+ mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u16(mask0, mask1);
+ mask2 = vorrq_u16(mask2, mask3);
+ mask0 = vorrq_u16(mask0, mask2);
+
+ mc = vbslq_u16(mask0, mc, vdupq_n_u16(0));
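+
+ // Hysteresis classification of the surviving magnitudes: values above upper_thr become EDGE,
+ // values at or below lower_thr become NO_EDGE, and values in between are marked MAYBE so the
+ // edge tracing stage can later promote them if they are connected to an EDGE pixel.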
+
+ // mc > upper_thr
+ mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr));
+ mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr)));
+
+ mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc);
+ mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc);
+ mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc);
+
+ vst1_u8(output, vmovn_u16(mc));
+}
+
+inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
+{
+ // Phase for 4 pixels
+ const uint32x4_t pc32 = vmovl_u16(pc);
+
+ // Get magnitude for 4 pixels
+ uint32x4_t mc = vld1q_u32(input);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ // 0 degree
+ const uint32x4_t mk0_0 = vld1q_u32(input - 1);
+ const uint32x4_t mk0_1 = vld1q_u32(input + 1);
+ uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+
+ // 45 degree
+ const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
+ const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
+ uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+
+ // 90 degree
+ const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
+ const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
+ uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+
+ // 135 degree
+ const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
+ const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
+ uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u32(mask0, mask1);
+ mask2 = vorrq_u32(mask2, mask3);
+ mask0 = vorrq_u32(mask0, mask2);
+
+ mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
+
+ // mc > upper_thr
+ mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
+ mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
+
+ mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
+ mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
+ mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
+
+ return vmovn_u32(mc);
+}
+
+/* Computes non-maxima suppression and hysteresis when the gradient_size = 7
+ *
+ * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U32
+ * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
+ * @param[out] output_ptr Pointer to destination image. Data type supported U8
+ * @param[in] stride_mag Stride of magnitude image
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ */
+void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
+ const int32_t upper_thr)
+{
+ const auto magnitude = static_cast<const uint32_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<const uint8_t *__restrict>(phase_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ // Get phase for 8 pixels
+ const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
+
+ // Compute non maxima suppression
+ const uint16x4x2_t res =
+ {
+ {
+ non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr),
+ non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr)
+ }
+ };
+
+ // Store result
+ vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1])));
+}
+
+/* Recursively traces edges; called by edge_trace_U8_U8
+ *
+ * @param[in] input Pointer to source image. Data type supported U8
+ * @param[out] output Pointer to destination image. Data type supported U8
+ * @param[in] input_stride Stride of the input image
+ * @param[in] output_stride Stride of the output image
+ */
+void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
+{
+ // Look for MAYBE pixels in 8 directions
+ *output = EDGE;
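+
+ // The centre pixel is already an EDGE; every MAYBE pixel 8-connected to it is promoted to
+ // EDGE in the input image and visited recursively, so whole chains of weak (MAYBE) pixels
+ // attached to a strong edge are traced in a single call.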
+
+ // (-1, 0)
+ uint8_t pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (+1, 0)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+
+ input -= input_stride;
+ output -= output_stride;
+
+ // (-1, -1)
+ pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (0, -1)
+ pixel = *input;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *input = EDGE;
+
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+
+ // (+1, -1)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+
+ input += input_stride * 2;
+ output += output_stride * 2;
+
+ // (-1, +1)
+ pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (0, +1)
+ pixel = *input;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *input = EDGE;
+
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+
+ // (+1, +1)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+}
+
+/* Computes edge tracing
+ *
+ * @param[in] input Pointer to source image. Data type supported U8
+ * @param[out] output Pointer to destination image. Data type supported U8
+ * @param[in] input_stride Stride of the input image
+ * @param[in] output_stride Stride of the output image
+ */
+void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
+{
+ if(*input == NO_EDGE)
+ {
+ *output = NO_EDGE;
+ }
+ // Check if EDGE and not yet touched
+ else if((*input == EDGE) && (*output == NO_EDGE))
+ {
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+}
+} // namespace
+
+NEGradientKernel::NEGradientKernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+ ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
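+ // norm_type == 1 selects the L1 norm (|gx| + |gy|); any other value selects the L2 norm
+ // (sqrt(gx^2 + gy^2)). The S16 variants cover gradient sizes 3 and 5, the S32 variants size 7.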
+ if(_gx->info()->data_type() == DataType::S16)
+ {
+ if(norm_type == 1)
+ {
+ _func = &mag_phase_l1norm_S16_S16_U16_U8;
+ }
+ else
+ {
+ _func = &mag_phase_l2norm_S16_S16_U16_U8;
+ }
+ }
+ else
+ {
+ if(norm_type == 1)
+ {
+ _func = &mag_phase_l1norm_S32_S32_U32_U8;
+ }
+ else
+ {
+ _func = &mag_phase_l2norm_S32_S32_U32_U8;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEGradientKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
+ },
+ gx, gy, magnitude, phase);
+}
+
+NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
+ : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
+{
+}
+
+BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
+ int32_t upper_thr, int32_t lower_thr, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);
+
+ set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());
+
+ set_format_if_unknown(*phase->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);
+
+ _magnitude = magnitude;
+ _phase = phase;
+ _output = output;
+
+ switch(_magnitude->info()->data_type())
+ {
+ case DataType::U16:
+ _func = &non_max_suppression_U16_U8_U8;
+ break;
+ case DataType::U32:
+ _func = &non_max_suppression_U32_U8_U8;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+
+ // Set thresholds
+ _lower_thr = lower_thr;
+ _upper_thr = upper_thr;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 10;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
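+
+ // 8 output elements are produced per iteration, but 10 magnitude values are read per row
+ // (one extra on each side) across 3 rows, covering the 3x3 neighbourhood inspected by the
+ // suppression functions.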
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, mag_access, phase_access, output_access);
+
+ output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEEdgeNonMaxSuppressionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+ Iterator output(_output, window);
+
+ const size_t input1_stride = _magnitude->info()->strides_in_bytes()[1];
+ const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());
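+
+ // The stride is converted from bytes to elements so that the +/- stride_mag offsets used in
+ // the suppression functions step over whole U16/U32 magnitude values.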
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
+ },
+ magnitude, phase, output);
+}
+
+NEEdgeTraceKernel::NEEdgeTraceKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NEEdgeTraceKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+bool NEEdgeTraceKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+ const ValidRegion &output_valid_region = output->info()->valid_region();
+
+ // Reads can occur within the valid region of the input + border
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left,
+ input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+
+ // Writes can occur within the valid region of the output + border
+ AccessWindowStatic output_access(output->info(),
+ output_valid_region.anchor[0] - border_size().left,
+ output_valid_region.anchor[1] - border_size().top,
+ output_valid_region.anchor[0] + output_valid_region.shape[0] + border_size().right,
+ output_valid_region.anchor[1] + output_valid_region.shape[1] + border_size().bottom);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, _input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEEdgeTraceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[1];
+ const size_t output_stride = _output->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
new file mode 100644
index 0000000000..3147a698ad
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEChannelCombineKernel::NEChannelCombineKernel()
+ : _func(nullptr), _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }, _num_elems_processed_per_iteration(8),
+_is_parallelizable(true)
+{
+}
+
+void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+ ARM_COMPUTE_ERROR_ON(plane0 == output);
+ ARM_COMPUTE_ERROR_ON(plane1 == output);
+ ARM_COMPUTE_ERROR_ON(plane2 == output);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ if(plane3 != nullptr)
+ {
+ set_format_if_unknown(*plane3->info(), Format::U8);
+ }
+
+ set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+ if(plane3 != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+ }
+
+ const Format &output_format = output->info()->format();
+
+ if(output_format == Format::RGBA8888)
+ {
+ ARM_COMPUTE_ERROR_ON(plane3 == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+ }
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = plane3;
+ _output = output;
+ _output_multi = nullptr;
+
+ _num_elems_processed_per_iteration = 8;
+ _is_parallelizable = true;
+
+ switch(output_format)
+ {
+ case Format::RGB888:
+ _func = &NEChannelCombineKernel::combine_3C;
+ break;
+ case Format::RGBA8888:
+ _func = &NEChannelCombineKernel::combine_4C;
+ break;
+ case Format::UYVY422:
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
+ _num_elems_processed_per_iteration = 16;
+ _func = &NEChannelCombineKernel::combine_YUV_1p<true>;
+ break;
+ case Format::YUYV422:
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
+ _num_elems_processed_per_iteration = 16;
+ _func = &NEChannelCombineKernel::combine_YUV_1p<false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
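+ // For the packed 4:2:2 outputs the U and V planes are expected at half the horizontal
+ // resolution of the Y plane; the shapes below validate the sub-sampled widths.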
+ TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
+ TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
+
+ Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal plane0_access(plane0->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[0]);
+ AccessWindowHorizontal plane1_access(plane1->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[1]);
+ AccessWindowHorizontal plane2_access(plane2->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[2]);
+ AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration);
+
+ update_window_and_padding(
+ win,
+ plane0_access,
+ plane1_access,
+ plane2_access,
+ plane3_access,
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
+ plane1->info()->valid_region(),
+ plane2->info()->valid_region());
+
+ if(plane3 != nullptr)
+ {
+ valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
+ }
+
+ output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = plane0->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
+
+ if(output->info()->format() == Format::IYUV)
+ {
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
+ }
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+ _output = nullptr;
+ _output_multi = output;
+ bool has_two_planes = false;
+ unsigned int num_elems_written_plane1 = 8;
+
+ _num_elems_processed_per_iteration = 8;
+ _is_parallelizable = true;
+
+ const Format &output_format = output->info()->format();
+
+ switch(output_format)
+ {
+ case Format::NV12:
+ case Format::NV21:
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ _func = &NEChannelCombineKernel::combine_YUV_2p;
+ has_two_planes = true;
+ num_elems_written_plane1 = 16;
+ break;
+ case Format::IYUV:
+ _is_parallelizable = false;
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ _func = &NEChannelCombineKernel::combine_YUV_3p;
+ break;
+ case Format::YUV444:
+ _is_parallelizable = false;
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ _func = &NEChannelCombineKernel::combine_YUV_3p;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
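+
+ // The window is stepped by the largest vertical sub-sampling factor so that each iteration
+ // maps to a full group of rows in the sub-sampled chroma planes.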
+
+ Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
+ AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
+ AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
+ output_plane0_access,
+ output_plane1_access,
+ output_plane2_access);
+
+ ValidRegion plane0_valid_region = plane0->info()->valid_region();
+
+ ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
+
+ output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+bool NEChannelCombineKernel::is_parallelisable() const
+{
+ return _is_parallelizable;
+}
+
+void NEChannelCombineKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void NEChannelCombineKernel::combine_3C(const Window &win)
+{
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win);
+ Iterator p2(_planes[2], win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x3_t pixels =
+ {
+ {
+ vld1_u8(p0_ptr),
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr)
+ }
+ };
+
+ vst3_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_4C(const Window &win)
+{
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win);
+ Iterator p2(_planes[2], win);
+ Iterator p3(_planes[3], win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto p3_ptr = static_cast<uint8_t *>(p3.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x4_t pixels =
+ {
+ {
+ vld1_u8(p0_ptr),
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr),
+ vld1_u8(p3_ptr)
+ }
+ };
+
+ vst4_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, p3, out);
+}
+
+template <bool is_uyvy>
+void NEChannelCombineKernel::combine_YUV_1p(const Window &win)
+{
+ // Create sub-sampled uv window and init uv planes
+ Window win_uv(win);
+ win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
+ win_uv.validate();
+
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win_uv);
+ Iterator p2(_planes[2], win_uv);
+ Iterator out(_output, win);
+
+ constexpr auto shift = is_uyvy ? 1 : 0;
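+
+ // YUYV packs each pair of pixels as Y0 U Y1 V (luma in even byte lanes) while UYVY packs it
+ // as U Y0 V Y1 (luma in odd byte lanes); the shift below selects which lanes receive luma
+ // and which receive chroma.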
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x2_t pixels_y = vld2_u8(p0_ptr);
+ const uint8x8x2_t pixels_uv =
+ {
+ {
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr)
+ }
+ };
+
+ uint8x8x4_t pixels{ {} };
+ pixels.val[0 + shift] = pixels_y.val[0];
+ pixels.val[1 - shift] = pixels_uv.val[0];
+ pixels.val[2 + shift] = pixels_y.val[1];
+ pixels.val[3 - shift] = pixels_uv.val[1];
+
+ vst4_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_YUV_2p(const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]);
+ ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]);
+
+ // Copy first plane
+ copy_plane(win, 0);
+
+ // Update UV window
+ Window uv_win(win);
+ uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
+ uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
+ uv_win.validate();
+
+ // Update output win
+ Window out_win(win);
+ out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
+ out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
+ out_win.validate();
+
+ // Construct second plane
+ const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1;
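+
+ // NV12 interleaves the chroma plane as U,V pairs whereas NV21 interleaves it as V,U; the
+ // shift swaps which of the two chroma input planes feeds the first lane of the vst2 below.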
+ Iterator p1(_planes[1 + shift], uv_win);
+ Iterator p2(_planes[2 - shift], uv_win);
+ Iterator out(_output_multi->plane(1), out_win);
+
+ execute_window_loop(out_win, [&](const Coordinates & id)
+ {
+ const uint8x8x2_t pixels =
+ {
+ {
+ vld1_u8(p1.ptr()),
+ vld1_u8(p2.ptr())
+ }
+ };
+
+ vst2_u8(out.ptr(), pixels);
+ },
+ p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_YUV_3p(const Window &win)
+{
+ copy_plane(win, 0);
+ copy_plane(win, 1);
+ copy_plane(win, 2);
+}
+
+void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]);
+ ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]);
+
+ // Update window
+ Window tmp_win(win);
+ tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
+ tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
+ tmp_win.validate();
+
+ Iterator in(_planes[plane_id], tmp_win);
+ Iterator out(_output_multi->plane(plane_id), tmp_win);
+
+ execute_window_loop(tmp_win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ vst1_u8(out_ptr, vld1_u8(in_ptr));
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
new file mode 100644
index 0000000000..ebc4b85c98
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEChannelExtractKernel::NEChannelExtractKernel()
+ : _func(nullptr), _lut_index(0)
+{
+}
+
+void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(input == output);
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ unsigned int num_elems_processed_per_iteration = 8;
+
+ // Check format and channel
+ const Format format = input->info()->format();
+ const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 2 : 1;
+ TensorShape output_shape;
+
+ switch(format)
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 16;
+ output_shape = input->info()->tensor_shape();
+
+ if(format == Format::RGB888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
+ }
+ else if(format == Format::RGBA8888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+ }
+
+ switch(channel)
+ {
+ case Channel::R:
+ _lut_index = 0;
+ break;
+ case Channel::G:
+ _lut_index = 1;
+ break;
+ case Channel::B:
+ _lut_index = 2;
+ break;
+ case Channel::A:
+ if(format == Format::RGBA8888)
+ {
+ _lut_index = 3;
+ _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+ break;
+ }
+                // Fall through: Channel::A is only valid for the RGBA8888 format
+            default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ case Format::YUYV422:
+ case Format::UYVY422:
+ output_shape = input->info()->tensor_shape();
+
+ if(channel != Channel::Y)
+ {
+ output_shape.set(0, output_shape[0] / 2);
+ }
+
+ switch(channel)
+ {
+ case Channel::Y:
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::YUYV422 == format) ? 0 : 1;
+ break;
+ case Channel::U:
+ num_elems_processed_per_iteration = 32;
+ _func = &NEChannelExtractKernel::extract_YUYV_uv;
+ _lut_index = (Format::YUYV422 == format) ? 1 : 0;
+ break;
+ case Channel::V:
+ num_elems_processed_per_iteration = 32;
+ _func = &NEChannelExtractKernel::extract_YUYV_uv;
+ _lut_index = (Format::YUYV422 == format) ? 3 : 2;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(input->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion input_valid_region = input->info()->valid_region();
+
+ output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ switch(channel)
+ {
+ case Channel::Y:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ case Channel::U:
+ case Channel::V:
+ set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported channel for selected format");
+ }
+ break;
+ case Format::YUV444:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ unsigned int num_elems_processed_per_iteration = 32;
+
+ const Format &format = input->info()->format();
+
+ switch(format)
+ {
+ case Format::NV12:
+ case Format::NV21:
+ switch(channel)
+ {
+ case Channel::Y:
+ _input = input->plane(0);
+ _func = &NEChannelExtractKernel::copy_plane;
+ break;
+ case Channel::U:
+ _input = input->plane(1);
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::NV12 == format) ? 0 : 1;
+ break;
+ case Channel::V:
+ _input = input->plane(1);
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::NV12 == format) ? 1 : 0;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ _func = &NEChannelExtractKernel::copy_plane;
+ switch(channel)
+ {
+ case Channel::Y:
+ _input = input->plane(0);
+ break;
+ case Channel::U:
+ _input = input->plane(1);
+ break;
+ case Channel::V:
+ _input = input->plane(2);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ _output = output;
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, _input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEChannelExtractKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void NEChannelExtractKernel::extract_1C_from_2C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld2q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::extract_1C_from_3C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld3q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::extract_1C_from_4C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld4q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::extract_YUYV_uv(const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().step() % 2);
+
+ Window win_out(win);
+ win_out.set_dimension_step(Window::DimX, win.x().step() / 2);
+
+ Iterator in(_input, win);
+ Iterator out(_output, win_out);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld4q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::copy_plane(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ vst4_u8(out_ptr, vld4_u8(in_ptr));
+ },
+ in, out);
+}
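
A standalone sketch (not part of the patch) of the de-interleaving-load idea behind extract_1C_from_3C_img() and its 2C/4C variants above: vld3q_u8 splits packed RGBRGB... data into one register per channel, and an index (the role played by _lut_index) selects which register to store. The names below are illustrative only.

    // Illustrative only: extract one channel from 16 packed RGB888 pixels.
    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    void extract_channel_rgb(const uint8_t *rgb, uint8_t *dst, int channel)
    {
        const uint8x16x3_t pixels = vld3q_u8(rgb); // de-interleave 48 bytes into R, G, B lanes
        vst1q_u8(dst, pixels.val[channel]);        // channel: 0 = R, 1 = G, 2 = B
    }

    int main()
    {
        uint8_t rgb[48];
        for(int i = 0; i < 16; ++i)
        {
            rgb[3 * i + 0] = 10; // R
            rgb[3 * i + 1] = 20; // G
            rgb[3 * i + 2] = 30; // B
        }
        uint8_t g[16];
        extract_channel_rgb(rgb, g, 1);
        printf("%u\n", g[0]); // prints 20
        return 0;
    }
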
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
new file mode 100644
index 0000000000..6d370acff1
--- /dev/null
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+template <typename T>
+void NECol2ImKernel::run_col2im(const Window &window)
+{
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+ const int output_stride_y = _output->info()->strides_in_bytes().y();
+ const int output_stride_z = _output->info()->strides_in_bytes().z();
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Create iterators
+ Iterator in(_input, window);
+ Iterator out(_output, window_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int hidx = id.y();
+ const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x;
+
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
+}
+
+NECol2ImKernel::NECol2ImKernel()
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_data_type_if_unknown(*output->info(), input->info()->data_type());
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &NECol2ImKernel::run_col2im<uint8_t>;
+ break;
+ case 2:
+ _func = &NECol2ImKernel::run_col2im<uint16_t>;
+ break;
+ case 4:
+ _func = &NECol2ImKernel::run_col2im<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NECol2ImKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (this->*_func)(window);
+}
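
A standalone sketch (not part of the patch) of the index arithmetic in run_col2im() above: each input element at (x, y) is written to the output at width w = y % convolved_dims.first, height h = y / convolved_dims.first and channel c = x, which is exactly what the stride computation expresses in bytes. The dimensions below are illustrative only.

    // Illustrative only: scalar form of the col2im coordinate mapping.
    #include <cstdio>

    int main()
    {
        const int conv_w   = 4; // convolved_dims.first (output width)
        const int conv_h   = 3; // convolved_dims.second (output height)
        const int channels = 2;

        for(int x = 0; x < channels; ++x)            // id.x(): output channel
        {
            for(int y = 0; y < conv_w * conv_h; ++y) // id.y(): linearised pixel index
            {
                const int w = y % conv_w;
                const int h = y / conv_w;
                printf("in(%d,%d) -> out(w=%d, h=%d, c=%d)\n", x, y, w, h, x);
            }
        }
        return 0;
    }
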
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
new file mode 100644
index 0000000000..cb5152e2b3
--- /dev/null
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/NEON/NEColorConvertHelper.inl"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEColorConvertKernel::NEColorConvertKernel()
+ : _input(nullptr), _output(nullptr), _func(nullptr)
+{
+}
+
+void NEColorConvertKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_rgbx_to_rgb;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_yuyv_to_rgb<false, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_yuyv_to_rgb<false, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_yuyv_to_rgb<true, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_yuyv_to_rgb<true, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGBA8888:
+ _func = colorconvert_rgb_to_rgbx;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_nv12_to_rgb<true, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_nv12_to_rgb<true, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_nv12_to_rgb<false, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_nv12_to_rgb<false, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_iyuv_to_rgb<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_iyuv_to_rgb<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ unsigned int input_plane_count = 3;
+
+ if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
+ {
+ input_plane_count = 2;
+ }
+
+ AccessWindowHorizontal input0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
+ AccessWindowRectangle input2_access(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input0_access, input1_access, input2_access,
+ output_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+ input->plane(1)->info()->valid_region());
+
+ if(input_plane_count == 3)
+ {
+ intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
+ }
+
+ output_access.set_valid_region(win, intersect_region);
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+ set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0));
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_rgb_to_nv12<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_rgb_to_iyuv<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_rgb_to_yuv4<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_rgb_to_nv12<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_rgb_to_iyuv<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_rgb_to_yuv4<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_yuyv_to_nv12<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_yuyv_to_iyuv<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_yuyv_to_nv12<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_yuyv_to_iyuv<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ float sub_sampling = 1.f;
+
+ if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
+ {
+ win.set_dimension_step(Window::DimY, 2);
+ sub_sampling = 0.5f;
+ }
+
+ unsigned int output_plane_count = 3;
+
+ if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
+ {
+ output_plane_count = 2;
+ }
+
+ AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+ AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output0_access,
+ output1_access,
+ output2_access);
+
+ output0_access.set_valid_region(win, input->info()->valid_region());
+ output1_access.set_valid_region(win, input->info()->valid_region());
+ output2_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(input == output);
+
+ set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0));
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ _func = colorconvert_nv12_to_iyuv<true>;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_nv12_to_yuv4<true>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ _func = colorconvert_nv12_to_iyuv<false>;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_nv12_to_yuv4<false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_iyuv_to_nv12;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_iyuv_to_yuv4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
+ constexpr float input_sub_sampling = 0.5f;
+ const float output_sub_sampling = output->info()->format() == Format::YUV444 ? 1.f : 0.5f;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->plane(0)->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ unsigned int input_plane_count = 3;
+
+ if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
+ {
+ input_plane_count = 2;
+ }
+
+ unsigned int output_plane_count = 3;
+
+ if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
+ {
+ output_plane_count = 2;
+ }
+
+ AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
+ AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->plane(0)->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowRectangle(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
+ AccessWindowRectangle(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
+ output0_access,
+ output1_access,
+ output2_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+ input->plane(1)->info()->valid_region());
+
+ if(input_plane_count == 3)
+ {
+ intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
+ }
+
+ output0_access.set_valid_region(win, intersect_region);
+ output1_access.set_valid_region(win, intersect_region);
+ output2_access.set_valid_region(win, intersect_region);
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
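
A standalone sketch (not part of the patch) of the configure/run dispatch pattern used by NEColorConvertKernel above: configure() inspects the (input format, output format) pair once, stores a plain function pointer, and run() only dereferences it. The enum, functions and struct below are simplified stand-ins, not the library's types.

    // Illustrative only: format-pair dispatch resolved at configure time.
    #include <cstdio>

    enum class Format { RGB888, RGBA8888, UYVY422 };

    using ConvertFn = void (*)(const void *in, void *out);

    void convert_rgb_to_rgbx(const void *, void *) { printf("RGB888  -> RGBA8888\n"); }
    void convert_uyvy_to_rgb(const void *, void *) { printf("UYVY422 -> RGB888\n"); }

    struct Converter
    {
        ConvertFn func = nullptr;

        void configure(Format in, Format out)
        {
            if(in == Format::RGB888 && out == Format::RGBA8888)
            {
                func = &convert_rgb_to_rgbx;
            }
            else if(in == Format::UYVY422 && out == Format::RGB888)
            {
                func = &convert_uyvy_to_rgb;
            }
            // Unsupported pairs would be rejected here, as the kernel does with ARM_COMPUTE_ERROR.
        }

        void run(const void *in, void *out)
        {
            (*func)(in, out); // assumes configure() selected a function
        }
    };

    int main()
    {
        Converter c;
        c.configure(Format::UYVY422, Format::RGB888);
        c.run(nullptr, nullptr);
        return 0;
    }
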
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
new file mode 100644
index 0000000000..30e91ef253
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -0,0 +1,1618 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
+
+inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
+{
+ const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
+ vqmovn_s32(out2));
+ vst1q_s16(output, s16results);
+}
+
+inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
+{
+ const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
+ vqmovun_s32(out2)));
+ vst1_u8(output, u8results);
+}
+
+inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
+{
+ const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
+ const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
+ vst1q_s16(output, s16results);
+}
+
+inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
+{
+ const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
+ vqmovn_u32(out2)));
+ vst1_u8(output, u8results);
+}
+
+inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
+{
+ vst1q_s16(output, out);
+ vst1q_s16(output + 8, out2);
+}
+
+inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
+{
+ const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
+ vqmovun_s16(out2));
+ vst1q_u8(output, u8results);
+}
+
+inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
+{
+ const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
+ vqmovn_u16(out2));
+ vst1q_u8(output, u8results);
+}
+
+inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
+{
+ vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
+ vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
+}
+
+inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
+{
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x3_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row right value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+
+ // Calculate row left value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row right value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+}
+
+inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+
+ convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
+}
+
+inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x3_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+
+ // Calculate row left 2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row left 1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+ // Calculate row right +1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+ // Calculate row right +2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[2], mat4);
+}
+
+inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+ const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
+ const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x4_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1),
+ vget_high_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 3 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
+ // Calculate row right +3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
+
+ // Calculate row left 3 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row left 2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row left 1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+ // Calculate row right +1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[2], mat4);
+ // Calculate row right +2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
+ // Calculate row right +3 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
+}
+
+inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+ const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
+ const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
+ const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
+ const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x4_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1),
+ vget_high_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 4 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
+ // Calculate row right +3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
+ // Calculate row right +4 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[2], mat8);
+
+    // Calculate row left 4 value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[1], mat0);
+    // Calculate row left 3 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+    // Calculate row left 2 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+    // Calculate row left 1 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+    // Calculate row middle value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[2], mat4);
+    // Calculate row right +1 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
+    // Calculate row right +2 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
+    // Calculate row right +3 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
+    // Calculate row right +4 value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[3], mat8);
+}
+} // namespace
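
A standalone sketch (not part of the patch) of the sliding-window trick used by the convolve_row*x1 helpers above: for four adjacent output pixels the horizontal taps are row[i], row[i+1] and row[i+2], so vext_s16 builds the shifted vectors and a single widening multiply-accumulate (vmlal_s16) per tap covers all four pixels at once. The 1-2-1 kernel row below is illustrative only.

    // Illustrative only: one 3x1 row convolution for 4 pixels using vext_s16.
    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int16_t row[8]  = { 1, 2, 3, 4, 5, 6, 7, 8 };
        const int16_t conv[3] = { 1, 2, 1 }; // hypothetical 1x3 kernel row

        const int16x4_t r0 = vld1_s16(row);     // row[0..3]
        const int16x4_t r1 = vld1_s16(row + 4); // row[4..7]

        int32x4_t out = vdupq_n_s32(0);
        out = vmlal_s16(out, r0, vdup_n_s16(conv[0]));                  // left taps:   row[i]
        out = vmlal_s16(out, vext_s16(r0, r1, 1), vdup_n_s16(conv[1])); // middle taps: row[i+1]
        out = vmlal_s16(out, vext_s16(r0, r1, 2), vdup_n_s16(conv[2])); // right taps:  row[i+2]

        int32_t res[4];
        vst1q_s32(res, out);
        for(int i = 0; i < 4; ++i)
        {
            printf("%d (expected %d)\n", res[i], row[i] + 2 * row[i + 1] + row[i + 2]);
        }
        return 0;
    }
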
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
+ : INESimpleKernel(), _scale(0), _convolution{ {} }
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NEConvolutionKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2);
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ _input = input;
+ _output = output;
+
+ std::copy_n(conv, _convolution.size(), _convolution.begin());
+
+ if(scale == 0)
+ {
+ _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
+ }
+ else
+ {
+ _scale = scale;
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<3>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ // Load the matrix's coefficients into NEON registers:
+ const int16x4_t mat00 = vld1_dup_s16(_convolution.data());
+ const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1);
+ const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2);
+ const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3);
+ const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4);
+ const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5);
+ const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6);
+ const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7);
+ const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8);
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
+ const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top row:
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
+
+        // Load 16 bytes from the low row:
+ const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<5>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row5x1(out, out2, data_t2, _convolution.data());
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<7>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
+ const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top3 row:
+ const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t3, _convolution.data());
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
+
+ // Load 16 bytes from the low3 row:
+ const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<9>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
+ const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
+ const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
+ const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top4 row:
+ const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t4, _convolution.data());
+
+ // Load 16 bytes from the top3 row:
+ const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
+
+ // Load 16 bytes from the low3 row:
+ const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
+
+ // Load 16 bytes from the low4 row:
+ const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
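For reference, a scalar sketch of what each of the square convolution paths above computes per output pixel (illustration only, assuming a row-major U8 input with the given byte stride; the NEON code produces eight adjacent outputs per loop iteration from the pre-computed row pointers):

#include <cstdint>

// Scalar reference for one output pixel of an n x n square convolution
// (illustration only; conv holds the n*n coefficients row by row).
static inline int32_t square_convolution_at(const uint8_t *src, int stride, int x, int y,
                                            const int16_t *conv, int n)
{
    const int half = n / 2;
    int32_t   sum  = 0;
    for(int r = 0; r < n; ++r)
    {
        for(int c = 0; c < n; ++c)
        {
            sum += static_cast<int32_t>(src[(y - half + r) * stride + (x - half + c)]) * conv[r * n + c];
        }
    }
    return sum;
}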
+
+template <unsigned int matrix_size>
+void NEConvolutionKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_output->info()->format())
+ {
+ case Format::U8:
+ convolution<uint8_t>(window);
+ break;
+ case Format::S16:
+ convolution<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+template class arm_compute::NEConvolutionKernel<3>;
+template class arm_compute::NEConvolutionKernel<5>;
+template class arm_compute::NEConvolutionKernel<7>;
+template class arm_compute::NEConvolutionKernel<9>;
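When _scale != 1, each kernel above normalises its S32 accumulators through F32 before clamping and storing; a minimal scalar equivalent of that per-lane step (illustration only):

#include <cstdint>

// Per-lane equivalent of vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc), scale_val)):
// widen to float, multiply by 1/scale, truncate back towards zero.
static inline int32_t apply_scale(int32_t acc, uint32_t scale)
{
    return static_cast<int32_t>(static_cast<float>(acc) * (1.0f / scale));
}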
+
+/****************************************************************************************\
+ * Separable Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
+ : _conv_row{ { 0 } }, _border_size(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+
+ _input = input;
+ _output = output;
+ std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
+ _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ switch(_output->info()->data_type())
+ {
+ case DataType::U16:
+ convolve<uint16_t>(window);
+ break;
+ case DataType::S16:
+ convolve<int16_t>(window);
+ break;
+ case DataType::S32:
+ convolve<int32_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+ break;
+ }
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
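The horizontal kernels above load 16 input bytes once, widen them to two 8-lane vectors and build each shifted tap with vextq_* instead of re-loading; a scalar view of one output element of the 5-tap pass (illustration only):

#include <cstdint>

// One output element of the 5-tap horizontal pass (illustration only): the vector
// code computes eight of these per iteration, with vextq realising the
// row[x - 2 + k] shifts without extra loads.
static inline int32_t horizontal_conv5_at(const uint8_t *row, int x, const int16_t conv_row[5])
{
    int32_t sum = 0;
    for(int k = 0; k < 5; ++k)
    {
        sum += static_cast<int32_t>(row[x - 2 + k]) * conv_row[k];
    }
    return sum;
}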
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
+ const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
+ out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
+ out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
+ const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
+
+template class arm_compute::NESeparableConvolutionHorKernel<5>;
+template class arm_compute::NESeparableConvolutionHorKernel<7>;
+template class arm_compute::NESeparableConvolutionHorKernel<9>;
+
+template <unsigned int matrix_size>
+NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
+ : _conv_col{ { 0 } }, _scale(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2, 0);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(scale == 0);
+
+ _input = input;
+ _output = output;
+ std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
+ _scale = scale;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U16:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_u16<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_u16<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ case DataType::S16:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_s16<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_s16<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ case DataType::S32:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_s32<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_s32<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+ break;
+ }
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint16x8_t out0 = vdupq_n_u16(0);
+ uint16x8_t out1 = vdupq_n_u16(0);
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+ out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+ out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
+ }
+
+        // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
+ float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
+ out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
+ out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
+ store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
+
+ float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
+ float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
+ out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
+ out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
+ store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ }
+ else
+ {
+ store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
+ }
+ },
+ in, out);
+}
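The vertical pass reads the same horizontal offset from matrix_size intermediate rows and accumulates with the column coefficients; a scalar sketch of one output element (illustration only, accumulation shown in 32 bits for clarity, whereas the U16 path above accumulates in 16 bits per lane):

#include <cstdint>

// One output element of the vertical pass (illustration only): rows[r] points at
// intermediate row (r - matrix_size / 2) relative to the current output row.
template <unsigned int matrix_size>
static inline int32_t vertical_conv_at(const int16_t *const rows[matrix_size], int x,
                                       const int16_t conv_col[matrix_size])
{
    int32_t sum = 0;
    for(unsigned int r = 0; r < matrix_size; ++r)
    {
        sum += static_cast<int32_t>(rows[r][x]) * conv_col[r];
    }
    return sum;
}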
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int16x8_t out0 = vdupq_n_s16(0);
+ int16x8_t out1 = vdupq_n_s16(0);
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
+ out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
+ out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
+ }
+
+        // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
+ float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
+ out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
+ out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
+ store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
+
+ float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
+ float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
+ out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
+ out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
+ store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ }
+ else
+ {
+ store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
+ }
+ },
+ in, out);
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4x2_t out0 =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ int32x4x2_t out1 =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
+ out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
+ out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
+ out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
+ out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
+ }
+
+        // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
+ float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
+ out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
+ out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
+ out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
+ out0.val[1] = vcvtq_s32_f32(out0_f32_even);
+
+ float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
+ float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
+ out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
+ out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
+ out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
+ out1.val[1] = vcvtq_s32_f32(out1_f32_even);
+ }
+
+ const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
+ store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
+
+ const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
+ store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ },
+ in, out);
+}
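The S32 path above loads with vld2q_s32, which de-interleaves eight consecutive values into even/odd lanes, and restores the original element order with vzipq_s32 before storing; a minimal sketch of that round trip (illustration only):

#include <arm_neon.h>
#include <cstdint>

// De-interleave / re-interleave round trip (illustration only): vld2q_s32 yields
// { {s0,s2,s4,s6}, {s1,s3,s5,s7} } and vzipq_s32 puts the lanes back in order,
// so dst ends up identical to src.
static inline void roundtrip_vld2_vzip(const int32_t *src, int32_t *dst)
{
    const int32x4x2_t split  = vld2q_s32(src);
    const int32x4x2_t merged = vzipq_s32(split.val[0], split.val[1]);
    vst1q_s32(dst, merged.val[0]);
    vst1q_s32(dst + 4, merged.val[1]);
}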
+
+template class arm_compute::NESeparableConvolutionVertKernel<5>;
+template class arm_compute::NESeparableConvolutionVertKernel<7>;
+template class arm_compute::NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
+ : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
+{
+}
+
+BorderSize NEConvolutionRectangleKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
+ ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
+ ARM_COMPUTE_ERROR_ON(0 == scale);
+
+ _input = input;
+ _output = output;
+ _scale = scale;
+ _border_size = BorderSize(height / 2, width / 2);
+
+ // Setup the convolution matrix
+    // Set up the convolution matrix
+ _convolution.resize(nr_elements);
+ std::copy_n(conv, nr_elements, _convolution.begin());
+
+ // Set function index to help choose appropriate function in run()
+ _func_idx = get_index(height) * 4 + get_index(width);
+    ARM_COMPUTE_ERROR_ON(_func_idx >= (_nr_supported_sizes * _nr_supported_sizes));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
+ AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
+
+ INEKernel::configure(win);
+}
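For example, a 5x9 rectangle (height 5, width 9) yields _func_idx = get_index(5) * 4 + get_index(9) = 1 * 4 + 3 = 7, which selects convolution<T, 5, 9> from the row-major function tables in run() below.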
+
+void NEConvolutionRectangleKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
+
+ // uint8_t function table
+ static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
+ {
+ {
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
+ }
+ };
+ // int16_t function table
+ static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
+ {
+ {
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
+ }
+ };
+
+ // Run appropriate function
+ switch(_output->info()->format())
+ {
+ case Format::U8:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
+ (this->*func_table_u8[_func_idx])(window);
+ break;
+ case Format::S16:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
+ (this->*func_table_s16[_func_idx])(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
+{
+ switch(val)
+ {
+ case 3:
+ return 0;
+ case 5:
+ return 1;
+ case 7:
+ return 2;
+ case 9:
+ return 3;
+ default:
+            ARM_COMPUTE_ERROR("Unsupported dimension size");
+ return 0;
+ }
+}
+
+template <typename OutputType, unsigned int rows, unsigned int cols>
+void NEConvolutionRectangleKernel::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ std::array<unsigned char *, rows> input_ptrs{ {} };
+ const int16_t *conv = _convolution.data();
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+ const int k_row_half = rows / 2;
+ const int k_col_half = cols / 2;
+
+ // Set row pointers
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Perform appropriate convolution
+ for(unsigned int r = 0; r < rows; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ if(3 == cols)
+ {
+ convolve_row3x1(out, out2, data, conv + r * cols);
+ }
+ else if(5 == cols)
+ {
+ convolve_row5x1(out, out2, data, conv + r * cols);
+ }
+ else if(7 == cols)
+ {
+ convolve_row7x1(out, out2, data, conv + r * cols);
+ }
+ else if(9 == cols)
+ {
+ convolve_row9x1(out, out2, data, conv + r * cols);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported number of columns");
+ }
+ }
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
new file mode 100644
index 0000000000..32789cbe33
--- /dev/null
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+
+using namespace arm_compute;
+
+NECumulativeDistributionKernel::NECumulativeDistributionKernel()
+ : _input(nullptr), _distribution(nullptr), _cumulative_sum(nullptr), _output(nullptr)
+{
+}
+
+bool NECumulativeDistributionKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, distribution, cumulative_sum, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != cumulative_sum->num_bins());
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != output->num_elements());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->type());
+
+ _input = input;
+ _distribution = distribution;
+ _cumulative_sum = cumulative_sum;
+ _output = output;
+
+ INEKernel::configure(calculate_max_window(*input->info()));
+}
+
+void NECumulativeDistributionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_cumulative_sum->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(_distribution->num_bins() < 256, "Distribution must have at least 256 bins");
+
+    // Get the histogram, cumulative sum and output LUT buffers
+ const uint32_t *hist = _distribution->buffer();
+ uint32_t *cumulative_sum = _cumulative_sum->buffer();
+ uint8_t *output = _output->buffer();
+
+ // Calculate cumulative distribution
+ std::partial_sum(hist, hist + _histogram_size, cumulative_sum);
+
+ // Get the number of pixels that have the lowest value in the input image
+ const uint32_t cd_min = *std::find_if(hist, hist + _histogram_size, [](const uint32_t &v)
+ {
+ return v > 0;
+ });
+ const uint32_t image_size = cumulative_sum[_histogram_size - 1];
+
+ ARM_COMPUTE_ERROR_ON(cd_min > image_size);
+
+ // Create mapping lookup table
+ if(image_size == cd_min)
+ {
+ std::iota(output, output + _histogram_size, 0);
+ }
+ else
+ {
+ const float diff = image_size - cd_min;
+
+ for(unsigned int x = 0; x < _histogram_size; ++x)
+ {
+ output[x] = lround((cumulative_sum[x] - cd_min) / diff * 255.0f);
+ }
+ }
+}
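The lookup table built above is the standard histogram-equalisation mapping (the degenerate case where every pixel has the same value is handled separately with std::iota); a scalar reference of one LUT entry (illustration only):

#include <cmath>
#include <cstdint>

// One LUT entry of the histogram-equalisation mapping built above (illustration only):
//   lut[v] = round((cdf[v] - cdf_min) / (num_pixels - cdf_min) * 255)
// where cdf is the cumulative histogram, cdf_min the count of the darkest occupied bin
// and num_pixels the total pixel count (the last cumulative sum entry).
static inline uint8_t equalisation_lut_entry(uint32_t cdf_v, uint32_t cdf_min, uint32_t num_pixels)
{
    const float diff = static_cast<float>(num_pixels - cdf_min);
    return static_cast<uint8_t>(std::lround((cdf_v - cdf_min) / diff * 255.0f));
}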
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..902490ec38
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDepthConcatenateKernel::NEDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize NEDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+ // Otherwise it is not clear how the padding should be added onto the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+ _depth_offset = depth_offset;
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+    // The window needs to be based on the input as we copy all of its depths
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEDepthConcatenateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Offset output
+ const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
+ _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
+ uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes;
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
+
+ vst1q_f32(out_ptr, vld1q_f32(in_ptr));
+ },
+ input, output);
+}
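The offset computed at the start of run() places the copied plane centred in x/y and depth_offset planes into the output; a hypothetical helper expressing the same arithmetic (illustration only, not part of the patch):

#include <cstddef>

// Hypothetical helper mirroring the output offset computed in run() above
// (illustration only): centre the input block in x/y and shift it by
// depth_offset planes along z. stride_x/y/z are the output strides in bytes.
static inline size_t concat_output_offset(size_t first_element_offset,
                                          size_t stride_x, size_t stride_y, size_t stride_z,
                                          size_t left_right, size_t top_bottom, size_t depth_offset)
{
    return first_element_offset + left_right * stride_x + top_bottom * stride_y + depth_offset * stride_z;
}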
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
new file mode 100644
index 0000000000..56612a7703
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEDepthConvertKernel::NEDepthConvertKernel()
+ : _policy(), _shift(0)
+{
+}
+
+void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(shift >= 8);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS8 -> [out] F32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S32),
+ "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
+ "Only data_types supported [in] U16 -> [out] U8, U32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
+ "Only data_types supported [in] S16 -> [out] U8, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
+ "Only data_types supported [in] F32 -> [out] QS8");
+
+ _policy = policy;
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+}
+
+void NEDepthConvertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(nullptr == _input);
+ ARM_COMPUTE_ERROR_ON(nullptr == _output);
+ ARM_COMPUTE_ERROR_ON(_input == _output);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ /* Up-conversion QS8 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
+
+ float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
+ float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(_shift);
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::S16:
+ {
+ /* Up-conversion U8 -> S16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
+ },
+ input, output);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion U8 -> S32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+ }
+ };
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ },
+ input, output);
+ break;
+ }
+ case DataType::U16:
+ {
+ /* Up-conversion U8 -> U16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t texels =
+ {
+ {
+ vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
+ vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
+ }
+ };
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::S16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+ /* Down-conversion S16 -> U8 */
+ if(ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+ vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+ vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+ vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::S32:
+ {
+ const int32x4_t b = vdupq_n_s32(_shift);
+
+ /* Up-conversion S16 -> S32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ const int32x4x4_t texels_s32 =
+ {
+ {
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
+ }
+ };
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::U16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+ /* Down-conversion U16 -> U8 */
+ if(ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+ vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+ vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::U32:
+ {
+ const int32x4_t b = vdupq_n_s32(_shift);
+
+ /* Up-conversion U16 -> U32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
+ vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _output->info()->fixed_point_position();
+ /* Down-conversion F32 -> QS8 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t texels_f32 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
new file mode 100644
index 0000000000..bf7e0972d5
--- /dev/null
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEDerivativeKernel::NEDerivativeKernel()
+ : _func(nullptr), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize NEDerivativeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEDerivativeKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ const bool run_der_x = output_x != nullptr;
+ const bool run_der_y = output_y != nullptr;
+
+ if(run_der_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(run_der_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
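+    // Access windows for the three execution modes (X only, Y only, X and Y); only the ones matching the configured mode are used to update the window and padding below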
+ AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration);
+ AccessWindowRectangle in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+
+ if(run_der_x && run_der_y)
+ {
+ _func = &NEDerivativeKernel::derivative_xy;
+ update_window_and_padding(win, in_xy_access, out_x_access, out_y_access);
+ out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else
+ {
+ if(run_der_x)
+ {
+ _func = &NEDerivativeKernel::derivative_x;
+ update_window_and_padding(win, in_x_access, out_x_access);
+ out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else if(run_der_y)
+ {
+ _func = &NEDerivativeKernel::derivative_y;
+ update_window_and_padding(win, in_y_access, out_y_access);
+ out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+ }
+
+ INEKernel::configure(win);
+}
+
+void NEDerivativeKernel::derivative_x(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_x(_output_x, window);
+
+ /* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load left and right data */
+ const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
+ const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
+
+ /* Store result of derivative along the X direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out1);
+ },
+ in, out_x);
+}
+
+void NEDerivativeKernel::derivative_y(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_y(_output_y, window);
+
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ /* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load top and bottom data */
+ const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
+ const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
+
+ /* Store result of derivative along the Y direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
+ },
+ in, out_y);
+}
+
+void NEDerivativeKernel::derivative_xy(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_x(_output_x, window);
+ Iterator out_y(_output_y, window);
+
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ /* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load top, bottom, left and right data */
+ const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
+ const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
+ const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
+ const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out2 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out3 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
+
+ /* Store result of derivative along the Y direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
+
+ /* Store result of derivative along the X direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out2);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out3);
+ },
+ in, out_x, out_y);
+}
+
+void NEDerivativeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
new file mode 100644
index 0000000000..867cf77c49
--- /dev/null
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+BorderSize NEDilateKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEDilateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *in_ptr = in.ptr() - 1;
+ const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
+ const uint8x16_t mid_data = vld1q_u8(in_ptr);
+ const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
+
+ uint8x8_t top_high_data = vget_high_u8(top_data);
+ uint8x8_t top_low_data = vget_low_u8(top_data);
+
+ uint8x8_t mid_high_data = vget_high_u8(mid_data);
+ uint8x8_t mid_low_data = vget_low_u8(mid_data);
+
+ uint8x8_t bot_high_data = vget_high_u8(bot_data);
+ uint8x8_t bot_low_data = vget_low_u8(bot_data);
+
+ uint8x8_t p0, p1;
+
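+        // Grayscale dilation: each output pixel is the maximum of its 3x3 neighbourhood
+        // (in_ptr points at column x - 1; vext selects the centre and right columns of each row)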
+ p0 = top_low_data;
+ p1 = vext_u8(top_low_data, top_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(top_low_data, top_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = mid_low_data;
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = bot_low_data;
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ vst1_u8(out.ptr(), p0);
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
new file mode 100644
index 0000000000..effc50e7c0
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+// Internal load
+inline float32x4_t internal_vld1q(const float *in)
+{
+ return vld1q_f32(in);
+}
+inline qint8x16_t internal_vld1q(const qint8_t *in)
+{
+ return vld1q_qs8(in);
+}
+inline qint16x8_t internal_vld1q(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+// Internal store
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
+{
+ vst1q_qs8(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
+{
+ vst1_qs8(p, vqmovn_s16(v));
+}
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+// Internal vdup
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+inline qint8x16_t internal_vdupq_n(qint8_t v)
+{
+ return vdupq_n_qs8(v);
+}
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
+// Internal vadd
+inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
+{
+ return vaddq_f32(x, y);
+}
+inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
+{
+ return vqaddq_qs8(x, y);
+}
+inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
+{
+ return vqaddq_qs16(x, y);
+}
+
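+// Adds the per-channel bias (selected by the Z coordinate of the element, i.e. the output feature map) to every element of the input.
+// When in_place is true the result overwrites the input, otherwise it is written to the output tensor.
+// T1 is the input element type, T2 the bias (and output) element type.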
+template <typename T1, typename T2, bool in_place>
+void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+{
+ Iterator in(input, window);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in, out);
+ }
+}
+} // namespace
+
+NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
+ if(output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output);
+ }
+ ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+
+ _func = nullptr;
+ _bias = bias;
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1));
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access, bias_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ }
+ else
+ {
+ update_window_and_padding(win, input_access, bias_access);
+ input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
+ }
+ INEKernel::configure(win);
+
+ // Set appropriate function
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _bias, window, _output);
+}
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000000..d6088981aa
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,817 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
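+// internal_vld1q<stridex> loads one vector of input samples spaced stridex elements apart:
+// vld2q/vld3q (and vld2/vld3 for qint8) de-interleave the data so that .val[0] holds every 2nd/3rd element.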
+template <unsigned int stridex>
+float32x4_t internal_vld1q(const float *in);
+
+template <>
+float32x4_t internal_vld1q<1>(const float *in)
+{
+ return vld1q_f32(in);
+}
+
+template <>
+float32x4_t internal_vld1q<2>(const float *in)
+{
+ const float32x4x2_t tmp = vld2q_f32(in);
+ return tmp.val[0];
+}
+
+template <>
+float32x4_t internal_vld1q<3>(const float *in)
+{
+ const float32x4x3_t tmp = vld3q_f32(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint8x8_t internal_vld1q(const qint8_t *in);
+
+template <>
+qint8x8_t internal_vld1q<1>(const qint8_t *in)
+{
+ return vld1_qs8(in);
+}
+
+template <>
+qint8x8_t internal_vld1q<2>(const qint8_t *in)
+{
+ const qint8x8x2_t tmp = vld2_s8(in);
+ return tmp.val[0];
+}
+
+template <>
+qint8x8_t internal_vld1q<3>(const qint8_t *in)
+{
+ const qint8x8x3_t tmp = vld3_s8(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_s16(in);
+}
+
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline qint8x8_t internal_vdupq_n(qint8_t v)
+{
+ return vdup_n_qs8(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+{
+ return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+ return vqmlal_qs8(x, y, z, fixed_point_position);
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_1x1
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ /*
+                For a detailed explanation of how the algorithm works, refer to class convolver_3x3 below
+ */
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ for(int oz = 0; oz < range_z; ++oz)
+ {
+ auto p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
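+    /* ptr points to a row of the 3x3 kernel; each of the three coefficients is broadcast to all lanes of one vector (same layout as the qint8_t overload below) */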
+ const float32x4x3_t r =
+ {
+ {
+ vld1q_dup_f32(ptr),
+ vld1q_dup_f32(1 + ptr),
+ vld1q_dup_f32(2 + ptr)
+ }
+ };
+ return r;
+}
+inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
+{
+    /* ptr is a pointer to a row of a 3x3 matrix; the function returns 3 vectors, each holding the same value in all lanes:
+       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const qint8x8x3_t r =
+ {
+ {
+ vld1_dup_qs8(ptr),
+ vld1_dup_qs8(1 + ptr),
+ vld1_dup_qs8(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float32x4x3_t vtop =
+ {
+ {
+ vld1q_f32(in_top),
+ vld1q_f32(in_top + 4),
+ vld1q_f32(in_top + 8)
+ }
+ };
+ const float32x4x3_t vmid =
+ {
+ {
+ vld1q_f32(in_mid),
+ vld1q_f32(in_mid + 4),
+ vld1q_f32(in_mid + 8)
+ }
+ };
+ const float32x4x3_t vlow =
+ {
+ {
+ vld1q_f32(in_low),
+ vld1q_f32(in_low + 4),
+ vld1q_f32(in_low + 8)
+ }
+ };
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vtop.val[0], m0.val[0]),
+ vmulq_f32(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
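+    // With stride 2 only every second column of the stride-1 result is a valid output:
+    // pack lanes {0,2} of out.val[0] and lanes {0,2} of out.val[1] into the low four lanes of out.val[0]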
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
+
+template <>
+inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const qint8x8x3_t vtop =
+ {
+ {
+ vld1_qs8(in_top),
+ vld1_qs8(in_top + 8),
+ vld1_qs8(in_top + 16)
+ }
+ };
+ const qint8x8x3_t vmid =
+ {
+ {
+ vld1_qs8(in_mid),
+ vld1_qs8(in_mid + 8),
+ vld1_qs8(in_mid + 16)
+ }
+ };
+ const qint8x8x3_t vlow =
+ {
+ {
+ vld1_qs8(in_low),
+ vld1_qs8(in_low + 8),
+ vld1_qs8(in_low + 16)
+ }
+ };
+ qint16x8x2_t out =
+ {
+ {
+ vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
+ vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
+ }
+ };
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
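+    // With stride 2 only the even-indexed results are valid outputs: pack the even lanes of out.val[0] and out.val[1] into out.val[0]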
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
+ return out;
+}
+
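+// store_results<stridex> writes the results that are valid for the given stride:
+// stride 1 stores both vectors, stride 2 stores one vector, stride 3 stores only the low half of the first vector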
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+ vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+void store_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+ vst1q_qs16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vget_low_s16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+ vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
+}
+
+template <unsigned int stridex>
+void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+ vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
+}
+
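+// Number of input elements consumed per iteration: the number of elements written multiplied by the stride along X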
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration * 3;
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_3x3
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ /*
+            Each thread executing this kernel computes one or more planes of the output volume.
+
+            For example, if the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread for Z = [8,15],
+            the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
+
+            The outer loop of the algorithm iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: its main benefit
+            is that the NEON registers containing the kernel's values are set up only once and then reused for every XY position, instead of being reloaded for each XY value.
+
+            The algorithm does not require allocating any additional memory and computes the results directly in the output in two stages:
+            1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
+            2) Convolve the remaining planes and accumulate the results in the output plane which has been initialized in step 1.
+ */
+
+ for(int oz = 0; oz < num_planes_z; ++oz)
+ {
+ uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ store_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ accumulate_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
+template <typename T1, typename T2>
+inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+template <typename T1, typename T2>
+inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+
+NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
+ : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+{
+}
+
+BorderSize NEDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
+ "Pad > 0 not supported for 1x1 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
+ "Pad > 1 not supported for 3x3 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _conv_info = conv_info;
+ _kernel_size = weights->info()->dimension(0);
+ _border_size = BorderSize(conv_pad_y, conv_pad_x);
+
+ Window win = calculate_max_window(*output->info());
+
+ switch(_kernel_size)
+ {
+ case 1:
+ {
+ _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
+
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ case 3:
+ {
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ }
+ else
+ {
+ _num_elems_read_per_iteration = 24;
+ _num_elems_written_per_iteration = 32 >> conv_stride_x;
+ }
+
+ // Calculate right and bottom border
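+            // Grow the right/bottom border so that the vector loads issued for the last output positions stay inside the padded input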
+ const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+ const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+ _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+ _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+
+ // Create window and update padding
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
+ }
+ }
+
+ INEKernel::configure(win);
+}
+
+void NEDirectConvolutionLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ const int kernel_size = _weights->info()->dimension(0);
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+ break;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
new file mode 100644
index 0000000000..398503627c
--- /dev/null
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+BorderSize NEErodeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEErodeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *in_ptr = in.ptr() - 1;
+ const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
+ const uint8x16_t mid_data = vld1q_u8(in_ptr);
+ const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
+
+ uint8x8_t top_high_data = vget_high_u8(top_data);
+ uint8x8_t top_low_data = vget_low_u8(top_data);
+
+ uint8x8_t mid_high_data = vget_high_u8(mid_data);
+ uint8x8_t mid_low_data = vget_low_u8(mid_data);
+
+ uint8x8_t bot_high_data = vget_high_u8(bot_data);
+ uint8x8_t bot_low_data = vget_low_u8(bot_data);
+
+ uint8x8_t p0, p1;
+
+ p0 = top_low_data;
+ p1 = vext_u8(top_low_data, top_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(top_low_data, top_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = mid_low_data;
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = bot_low_data;
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ vst1_u8(out.ptr(), p0);
+ },
+ in, out);
+}
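+
+// A minimal scalar sketch of the same operation, for reference only (not part of the vectorised path above):
+// each output pixel is the minimum of its 3x3 neighbourhood; the NEON loop computes 8 such minima at once.
+// 'src' is assumed to point at the centre pixel and 'src_stride' is the row stride in elements.
+static inline uint8_t erode3x3_scalar(const uint8_t *src, int src_stride)
+{
+    uint8_t m = 255;
+    for(int dy = -1; dy <= 1; ++dy)
+    {
+        for(int dx = -1; dx <= 1; ++dx)
+        {
+            const uint8_t v = src[dy * src_stride + dx];
+            m = (v < m) ? v : m;
+        }
+    }
+    return m;
+}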
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
new file mode 100644
index 0000000000..9e8b5526a1
--- /dev/null
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstddef>
+#include <limits>
+
+using namespace arm_compute;
+
+NEFastCornersKernel::NEFastCornersKernel()
+ : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
+{
+}
+
+namespace
+{
+constexpr size_t PERMUTATIONS = 16;
+constexpr size_t PERM_SIZE = 16;
+
+inline uint8x8x2_t create_permutation_index(size_t k)
+{
+ ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
+
+ static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE]
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
+ { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
+ { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
+ { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
+ { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
+ { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
+ { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
+ { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
+ { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
+ { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
+ { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
+ { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
+ { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
+
+ };
+
+ const uint8x8x2_t index =
+ {
+ {
+ vld1_u8(permutations_table[k]),
+ vld1_u8(permutations_table[k] + 8)
+ }
+ };
+
+ return index;
+}
+
+inline uint8x8x4_t create_circle_index_register()
+{
+ /*
+    This function creates the index registers used to retrieve the 16 texels in the Bresenham circle of radius 3 centred on P.
+
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ Where . is an irrelevant texel value
+
+ We want to retrieve all texels [0,F]
+
+ The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels()
+
+ The first table holds the top 4 rows of texels
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+
+    The second table holds the bottom 3 rows of texels
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ */
+ static const uint8_t top_right[8] =
+ {
+ /* The register r.val[0] will be used to retrieve these texels:
+ . . . 0 1 . . .
+ . . . . . 2 . .
+ . . . . . . 3 .
+ . . . . . . 4 .
+ */
+ 3 /* top table, first row, elem 4, value 0 in the diagram above */,
+ 4 /* top table, first row, elem 5, value 1 in the diagram above */,
+ 13 /* top table, second row, elem 6, value 2 in the diagram above */,
+ 22 /* top table, third row, elem 7, value 3 in the diagram above*/,
+ 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/,
+ 255,
+ 255,
+ 255
+ };
+
+ static const uint8_t bottom_right[8] =
+ {
+ /* The register r.val[1] will be used to retrieve these texels:
+ . . . . . . 5 .
+ . . . . . 6 . .
+ . . . . 7 . . .
+ */
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 6 /* low table, first row, elem 7, value 5 in the diagram above*/,
+ 13 /* low table, second row, elem 6, value 6 in the diagram above*/,
+ 20 /* low table, third row, elem 5, value 7 in the diagram above*/
+ };
+
+ static const uint8_t top_left[8] =
+ {
+ /* The register r.val[2] will be used to retrieve these texels:
+ . . F . . . . .
+ . E . . . . . .
+ D . . . . . . .
+ C . . . . . . .
+ */
+ 255,
+ 255,
+ 255,
+ 255,
+ 24 /* top table, fourth row, elem 1, value C in the diagram above */,
+ 16 /* top table, third row, elem 1, value D in the diagram above*/,
+ 9 /* top table, second row, elem 2, value E in the diagram above*/,
+ 2 /* top table, first row, elem 3, value F in the diagram above*/
+ };
+
+ static const uint8_t bottom_left[8] =
+ {
+ /* The register r.val[3] will be used to retrieve these texels:
+ B . . . . . . .
+ . A . . . . . .
+ . . 9 8 . . . .
+ */
+ 19 /* low table, third row, elem 4, value 8 in the diagram above */,
+ 18 /* low table, third row, elem 3, value 9 in the diagram above */,
+ 9 /* low table, second row, elem 2, value A in the diagram above */,
+ 0 /* low table, first row, elem 1, value B in the diagram above */,
+ 255,
+ 255,
+ 255,
+ 255
+ };
+
+ const uint8x8x4_t reg =
+ {
+ {
+ vld1_u8(top_right),
+ vld1_u8(bottom_right),
+ vld1_u8(top_left),
+ vld1_u8(bottom_left)
+ }
+ };
+
+ return reg;
+}
+
+inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo)
+{
+ /*
+ This function loads the 16 texels in the Bresenham circle of radius 3 into the register 'texels'.
+    The parameter 'index' is an array of indices which was previously set up in create_circle_index_register().
+ tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P
+ */
+ return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
+ vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
+}
+
+inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle)
+{
+ /*
+    This function gathers the 9 texels of a given permutation X into the returned NEON register
+
+ 'tbl_circle' is a LUT with the texels 0 to F
+
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ 'permutation_index' is one of the permutations below:
+
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8},
+ { F, 0, 1, 2, 3, 4, 5, 6, 7},
+ { E, F, 0, 1, 2, 3, 4, 5, 6},
+ { D, E, F, 0, 1, 2, 3, 4, 5},
+ { C, D, E, F, 0, 1, 2, 3, 4},
+ { B, C, D, E, F, 0, 1, 2, 3},
+ { A, B, C, D, E, F, 0, 1, 2},
+ { 9, A, B, C, D, E, F, 0, 1},
+ { 8, 9, A, B, C, D, E, F, 0},
+ { 7, 8, 9, A, B, C, D, E, F},
+ { 6, 7, 8, 9, A, B, C, D, E},
+ { 5, 6, 7, 8, 9, A, B, C, D},
+ { 4, 5, 6, 7, 8, 9, A, B, C},
+ { 3, 4, 5, 6, 7, 8, 9, A, B},
+ { 2, 3, 4, 5, 6, 7, 8, 9, A},
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9},
+ */
+ static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes
+
+ return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
+ vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
+}
+
+inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg)
+{
+ const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
+
+ return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
+}
+
+inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl)
+{
+ const uint8x16_t res_lt = vcltq_u8(permutation, pl);
+ const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
+ const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0);
+ const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1);
+
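+    // Note: lanes 0..7 of 'permutation' hold texels 0..7 and lane 8 holds texel 8, while lanes 9..15 were
+    // filled with 255 by get_permutation_texels() and can never be less than pl. A "darker" permutation
+    // therefore yields all-ones in the low 64 bits (t3) and only the lowest byte set in the high 64 bits (t4 == 255).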
+ return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
+}
+
+inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl)
+{
+ return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
+}
+
+inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS])
+{
+ /*
+ This function determines whether the point 'p' is a corner.
+ */
+ uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
+ uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
+
+ bool corner_detected = false;
+
+ for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
+ {
+ const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
+ corner_detected = is_permutation_corner(pe_texels, pg, pl);
+ }
+
+ return corner_detected;
+}
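+
+// Scalar sketch of the same segment test, for illustration only: p is a corner if any of the 16 rotations of
+// the circle contains 9 contiguous texels that are all brighter than p + t or all darker than p - t.
+// 'circle' is a hypothetical array holding the 16 circle texels in order; it mirrors the LUT built above.
+inline bool is_fast_corner_scalar(uint8_t p, uint8_t t, const uint8_t (&circle)[16])
+{
+    const int pg = std::min<int>(p + t, 255);
+    const int pl = std::max<int>(p - t, 0);
+
+    for(size_t start = 0; start < PERMUTATIONS; ++start)
+    {
+        bool all_brighter = true;
+        bool all_darker   = true;
+
+        for(size_t j = 0; j < 9; ++j)
+        {
+            const int texel = circle[(start + j) % PERM_SIZE];
+            all_brighter    = all_brighter && (texel > pg);
+            all_darker      = all_darker && (texel < pl);
+        }
+
+        if(all_brighter || all_darker)
+        {
+            return true;
+        }
+    }
+
+    return false;
+}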
+
+inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r)
+{
+ /*
+    This function builds a LUT holding the 16 texels in the Bresenham circle of radius 3.
+ circle_index_r is a vector of 4 registers to retrieve the texels from the two tables mentioned above.
+ */
+
+ //Load the texels in the window [(x-3,y-3),(x+3,y+3)].
+ //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo.
+ //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3.
+ const uint8x8x4_t tbl_window_hi =
+ {
+ {
+ vld1_u8(buffer[0] + in_offset),
+ vld1_u8(buffer[1] + in_offset),
+ vld1_u8(buffer[2] + in_offset),
+ vld1_u8(buffer[3] + in_offset)
+ }
+ };
+
+ const uint8x8x3_t tbl_window_lo =
+ {
+ {
+ vld1_u8(buffer[4] + in_offset),
+ vld1_u8(buffer[5] + in_offset),
+ vld1_u8(buffer[6] + in_offset)
+ }
+ };
+
+ const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
+
+ const uint8x8x2_t tbl_circle_texels =
+ {
+ {
+ vget_low_u8(circle_texels),
+ vget_high_u8(circle_texels)
+ }
+ };
+
+ return tbl_circle_texels;
+}
+
+inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS])
+{
+ uint8_t b = 255;
+ uint8_t a = tolerance;
+
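+    // Bisection on the threshold: the caller only asks for a score after point_is_fast_corner() succeeded at
+    // 'tolerance', so 'a' always holds a threshold at which p passes the segment test, while 'b' holds one at
+    // which it fails (at 255 the saturated bounds make the test impossible). The score is the largest passing threshold.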
+ while(b - a > 1)
+ {
+ const uint16_t ab = a + b;
+ const uint8_t c = ab >> 1;
+
+ if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
+ {
+ a = c;
+ }
+ else
+ {
+ b = c;
+ }
+ }
+
+ return a;
+}
+} // namespace
+
+BorderSize NEFastCornersKernel::border_size() const
+{
+ return BorderSize(3);
+}
+
+void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented");
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+ _non_max_suppression = non_max_suppression;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEFastCornersKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
+ /*
+    We use a LUT loaded with 7 rows of uint8_t from the input image [-3,-3]...[+3,+3] to retrieve the texels in the Bresenham circle of radius 3 and put them in one NEON register uint8x16_t.
+    The call below sets up the NEON index registers used to get these texels out of the table
+ */
+ const uint8x8x4_t circle_index_r = create_circle_index_register();
+ /*
+    We put the 16 texels (circle) in a LUT to easily generate all the permutations. The for loop below sets up the indices for each permutation.
+ */
+ for(size_t k = 0; k < PERMUTATIONS; ++k)
+ {
+ perm_index[k] = create_permutation_index(k);
+ }
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ const uint8_t *const __restrict in_row[7] =
+ {
+ _input->ptr_to_element(Coordinates(-3, -3)),
+ _input->ptr_to_element(Coordinates(-3, -2)),
+ _input->ptr_to_element(Coordinates(-3, -1)),
+ _input->ptr_to_element(Coordinates(-3, 0)),
+ _input->ptr_to_element(Coordinates(-3, 1)),
+ _input->ptr_to_element(Coordinates(-3, 2)),
+ _input->ptr_to_element(Coordinates(-3, 3))
+ };
+
+ auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b)
+ {
+ const bool p_is_in_ab = (a <= p) && (p <= b);
+ const bool q_is_in_ab = (a <= q) && (q <= b);
+ return p_is_in_ab && q_is_in_ab;
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t in_offset = in.offset();
+ const uint8_t p0 = *in.ptr();
+ const uint8_t b = std::min(p0 + _threshold, 255);
+ const uint8_t a = std::max(p0 - _threshold, 0);
+ uint8_t score = 0;
+ /*
+        Fast check to discard points which cannot be corners, avoiding the expensive evaluation of the 16 permutations.
+
+        Pixels 1 and 9 are examined first: if both I1 and I9 are within [Ip - t, Ip + t], then the candidate p cannot be a corner.
+ */
+ const uint8_t p1 = (in_offset + in_row[0])[3];
+ const uint8_t p9 = (in_offset + in_row[6])[3];
+
+ if(!is_rejected(p1, p9, a, b))
+ {
+            /* pixels 5 and 13 are examined next: for p to be a corner, at least three of pixels 1, 5, 9 and 13 must be brighter than Ip + t or darker than Ip - t */
+ const uint8_t p5 = (in_offset + in_row[3])[6];
+ const uint8_t p13 = (in_offset + in_row[3])[0];
+
+ if(!is_rejected(p5, p13, a, b))
+ {
+ /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
+ const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
+
+ if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data()))
+ {
+ if(_non_max_suppression)
+ {
+ score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data());
+ }
+ else
+ {
+ score = 1;
+ }
+ }
+ }
+ }
+
+ *out.ptr() = score;
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
new file mode 100644
index 0000000000..7e7e1c2501
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+NEFillArrayKernel::NEFillArrayKernel()
+ : _input(nullptr), _output(nullptr), _threshold(0)
+{
+}
+
+void NEFillArrayKernel::configure(const IImage *input, uint8_t threshold, IKeyPointArray *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_read_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+bool NEFillArrayKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NEFillArrayKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8_t value = *input.ptr();
+
+ if(value >= _threshold)
+ {
+ KeyPoint p;
+ p.x = id.x();
+ p.y = id.y();
+ p.strength = value;
+ p.tracking_status = 1;
+
+ if(!_output->push_back(p))
+ {
+ return; //Overflowed: stop trying to add more points
+ }
+ }
+ },
+ input);
+}
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
new file mode 100644
index 0000000000..bd99242b11
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEFillBorderKernel::NEFillBorderKernel()
+ : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(0)
+{
+}
+
+void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+
+ _tensor = tensor;
+ _border_size = border_size;
+ _mode = border_mode;
+ _constant_border_value = constant_border_value;
+
+ _border_size.limit(tensor->info()->padding());
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ INEKernel::configure(win);
+}
+
+void NEFillBorderKernel::run(const Window &window)
+{
+ // If there is no border: early exit
+ if(_border_size.empty())
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_mode)
+ {
+ case BorderMode::CONSTANT:
+ {
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_constant_value_single_channel<uint8_t>(window);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_constant_value_single_channel<int8_t>(window);
+ break;
+ case DataType::U16:
+ fill_constant_value_single_channel<uint16_t>(window);
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ fill_constant_value_single_channel<int16_t>(window);
+ break;
+ case DataType::U32:
+ fill_constant_value_single_channel<uint32_t>(window);
+ break;
+ case DataType::S32:
+ fill_constant_value_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_constant_value_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ break;
+ }
+ case BorderMode::REPLICATE:
+ {
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_replicate_single_channel<uint8_t>(window);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_replicate_single_channel<int8_t>(window);
+ break;
+ case DataType::U16:
+ fill_replicate_single_channel<uint16_t>(window);
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ fill_replicate_single_channel<int16_t>(window);
+ break;
+ case DataType::U32:
+ fill_replicate_single_channel<uint32_t>(window);
+ break;
+ case DataType::S32:
+ fill_replicate_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_replicate_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ break;
+ }
+ case BorderMode::UNDEFINED:
+ break; // Nothing to do here
+ default:
+ ARM_COMPUTE_ERROR("Unknown border mode");
+ }
+}
+
+template <typename T>
+void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
+{
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const size_t &width = _tensor->info()->valid_region().shape[0];
+ const size_t &height = _tensor->info()->valid_region().shape[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
+ const auto left_val = *reinterpret_cast<T *>(vertical_it.ptr());
+ const auto right_val = *(reinterpret_cast<T *>(vertical_it.ptr()) + width - 1);
+
+ // Fill left and right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left, left_val);
+ std::fill_n(row_start + width, _border_size.right, right_val);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(_tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto first_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset());
+
+ // Top border
+ for(int i = -_border_size.top; i < 0; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Copy top rows including left/right borders
+ std::copy_n(first_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ }
+
+ const auto last_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + (height - 1) * _tensor->info()->strides_in_bytes()[1]);
+
+ // Bottom border
+ for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Copy bottom rows including left/right borders
+ std::copy_n(last_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ }
+ },
+ plane_it);
+}
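+
+// Minimal scalar sketch of the replicate rule for a single padded row, for illustration only
+// ('row' points at the first valid element, 'left' and 'right' are hypothetical border sizes): the first and
+// last valid elements are copied outwards, e.g. [a b c d] with left=2, right=1 becomes [a a | a b c d | d].
+template <typename T>
+void replicate_row_sketch(T *row, size_t width, size_t left, size_t right)
+{
+    std::fill_n(row - left, left, row[0]);           // left border takes the first valid value
+    std::fill_n(row + width, right, row[width - 1]); // right border takes the last valid value
+}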
+
+template <typename T>
+void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
+{
+ T constant_border_value;
+ _constant_border_value.get(constant_border_value);
+
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const size_t &width = _tensor->info()->valid_region().shape[0];
+ const size_t &height = _tensor->info()->valid_region().shape[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
+
+ // Fill left and right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left, constant_border_value);
+ std::fill_n(row_start + width, _border_size.right, constant_border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(_tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Top border
+ for(int i = -_border_size.top; i < 0; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Fill top rows including left/right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ }
+
+ // Bottom border
+ for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ }
+ },
+ plane_it);
+}
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
new file mode 100644
index 0000000000..699a5d9299
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEFillInnerBorderKernel::NEFillInnerBorderKernel()
+ : _tensor(nullptr), _border_size(0), _constant_border_value(0)
+{
+}
+
+void NEFillInnerBorderKernel::configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::S32, DataType::F32);
+
+ _tensor = input;
+ _border_size = border_size;
+ _constant_border_value = constant_border_value;
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ INEKernel::configure(win);
+}
+
+void NEFillInnerBorderKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // If there is no border: early exit
+ if(_border_size.empty())
+ {
+ return;
+ }
+
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_value_single_channel<uint8_t>(window);
+ break;
+ case DataType::S16:
+ fill_value_single_channel<int16_t>(window);
+ break;
+ case DataType::S32:
+ fill_value_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_value_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ break;
+ }
+}
+
+template <typename T>
+void NEFillInnerBorderKernel::fill_value_single_channel(const Window &window)
+{
+ const size_t stride = _tensor->info()->strides_in_bytes()[1];
+ const size_t width = _tensor->info()->dimension(0);
+ const size_t height = _tensor->info()->dimension(1);
+
+ T constant_border_value;
+ _constant_border_value.get(constant_border_value);
+
+ // Left and right border
+ // All X values are set at once
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value);
+ std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ // All values are set at once
+ Iterator horizontal_it(_tensor, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ for(size_t i = 0; i < _border_size.top; ++i)
+ {
+ std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + i * stride), width, constant_border_value);
+ }
+
+ for(size_t i = 0; i < _border_size.bottom; ++i)
+ {
+ std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + (height - i - 1) * stride), width, constant_border_value);
+ }
+ },
+ horizontal_it);
+}
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..3ff8b7b201
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 32);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const uint8x8x4_t data =
+ {
+ {
+ vld1_u8(in.ptr() + 0 * in_stride),
+ vld1_u8(in.ptr() + 1 * in_stride),
+ vld1_u8(in.ptr() + 2 * in_stride),
+ vld1_u8(in.ptr() + 3 * in_stride),
+ }
+ };
+ vst4_u8(out.ptr(), data);
+ },
+ in, out);
+}
+
+void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 16);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x4x4_t data =
+ {
+ {
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ }
+ };
+ vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
+ },
+ in, out);
+}
+
+void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 16);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint32x4x4_t data =
+ {
+ {
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ }
+ };
+ vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
+ },
+ in, out);
+}
+} // namespace
+
+NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
+ : _func(nullptr)
+{
+}
+
+void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration_x = 4;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ num_elems_processed_per_iteration_x = 8;
+ _func = &gemm_interleave_8bit_elements;
+ break;
+ case 2:
+ _func = &gemm_interleave_16bit_elements;
+ break;
+ case 4:
+ _func = &gemm_interleave_32bit_elements;
+ break;
+ default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ update_window_and_padding(win, output_access, input_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMInterleave4x4Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ /*
+ * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+ * |a30 a31 a32 a33|
+ *
+     * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4.0f) ]
+ */
+ (*_func)(_input, _output, window);
+}
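+
+// Scalar sketch of the interleaving, for reference only ('w' and 'h' are the input width and height, assumed
+// to be multiples of the block size here): element (x, y) of the input ends up in output row y / 4 at column
+// x * 4 + (y % 4), which is the layout produced by the vst4 stores above.
+template <typename T>
+void gemm_interleave4x4_sketch(const T *in, T *out, size_t w, size_t h)
+{
+    for(size_t y = 0; y < h; ++y)
+    {
+        for(size_t x = 0; x < w; ++x)
+        {
+            out[(y / 4) * (w * 4) + x * 4 + (y % 4)] = in[y * w + x];
+        }
+    }
+}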
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..3558c686b1
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _output_offset(0), _output_mult_int(0), _shift(0)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _output_offset = output_offset;
+ _output_mult_int = output_mult_int;
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowHorizontal in0_access(input0->info(), 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win, in0_access, in1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
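+
+// Scalar sketch of the per-element arithmetic this kernel performs, for illustration only: the real kernel
+// consumes the interleaved/transposed inputs and produces 16x4 output elements per iteration, but each output
+// value follows the same formula. 'm', 'n' and 'k' are hypothetical (un-reshaped) matrix dimensions.
+static void gemmlowp_reference_sketch(const uint8_t *a, const uint8_t *b, uint8_t *out, int m, int n, int k,
+                                      int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+    for(int i = 0; i < m; ++i)
+    {
+        for(int j = 0; j < n; ++j)
+        {
+            int32_t acc = 0;
+            for(int l = 0; l < k; ++l)
+            {
+                acc += (static_cast<int32_t>(a[i * k + l]) + a_offset) * (static_cast<int32_t>(b[l * n + j]) + b_offset);
+            }
+            acc = ((acc + output_offset) * output_mult_int) >> shift;                     // requantize
+            out[i * n + j] = static_cast<uint8_t>(acc < 0 ? 0 : (acc > 255 ? 255 : acc)); // saturate to U8
+        }
+    }
+}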
+
+void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
+ const size_t out_stride = _output->info()->strides_in_bytes()[1];
+
+    /* Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix */
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1));
+
+    /* Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix */
+ Window win_b(window);
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    /* The step x and step y for the output matrix have already been set in configure() */
+ Iterator ina(_input0, win_a);
+ Iterator inb(_input1, win_b);
+ Iterator out(_output, window);
+
+ const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
+ const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
+ const int32x4_t vshiftr = vdupq_n_s32(-_shift);
+
+ const int width_b = _input1->info()->dimension(0);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping of the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed to compute 16x4 elements per iteration
+    // All the values needed for computing a single 4x4 block are read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
+
+ // Accumulators for the block 0
+ int32x4x4_t c0 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 1
+ int32x4x4_t c1 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 2
+ int32x4x4_t c2 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 3
+ int32x4x4_t c3 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ int k = 0;
+ // This for loop performs 4 accumulations per iteration
+ for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64)
+ {
+ const uint8x8_t p00 = vld1_u8(mtx_a0 + 0);
+ const uint8x8_t p01 = vld1_u8(mtx_a0 + 8);
+ const uint8x8_t q00l = vld1_u8(mtx_b0 + 0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+ const uint8x8_t q01l = vld1_u8(mtx_b0 + 16);
+ const uint8x8_t q01h = vld1_u8(mtx_b0 + 24);
+ const uint8x8_t q02l = vld1_u8(mtx_b0 + 32);
+ const uint8x8_t q02h = vld1_u8(mtx_b0 + 40);
+ const uint8x8_t q03l = vld1_u8(mtx_b0 + 48);
+ const uint8x8_t q03h = vld1_u8(mtx_b0 + 56);
+
+ const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+ const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00))));
+ const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01))));
+ const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01))));
+
+ const int32x2x4_t ia0 =
+ {
+ {
+ vget_low_s32(ia0l),
+ vget_high_s32(ia0l),
+ vget_low_s32(ia0h),
+ vget_high_s32(ia0h)
+ }
+ };
+
+ const int32x2x4_t ia1 =
+ {
+ {
+ vget_low_s32(ia1l),
+ vget_high_s32(ia1l),
+ vget_low_s32(ia1h),
+ vget_high_s32(ia1h)
+ }
+ };
+
+ const int32x4x4_t ib0 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+ }
+ };
+
+ const int32x4x4_t ib1 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h))))
+ }
+ };
+
+ const int32x4x4_t ib2 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h))))
+ }
+ };
+
+ const int32x4x4_t ib3 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h))))
+ }
+ };
+
+ // 4x4 block 0 - Accumulation 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1);
+ // 4x4 block 0 - Accumulation 1
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1);
+ // 4x4 block 0 - Accumulation 2
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1);
+ // 4x4 block 0 - Accumulation 3
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1);
+
+ // 4x4 block 1 - Accumulation 0
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1);
+ // 4x4 block 1 - Accumulation 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1);
+ // 4x4 block 1 - Accumulation 2
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1);
+ // 4x4 block 1 - Accumulation 3
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib3.val[1], ia1.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1);
+
+ // 4x4 block 2 - Accumulation 0
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1);
+ // 4x4 block 2 - Accumulation 1
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1);
+ // 4x4 block 2 - Accumulation 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1);
+ // 4x4 block 2 - Accumulation 3
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1);
+
+ // 4x4 block 3 - Accumulation 0
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1);
+ // 4x4 block 3 - Accumulation 1
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1);
+ // 4x4 block 3 - Accumulation 2
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1);
+ // 4x4 block 3 - Accumulation 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1);
+ }
+
+ // This for loop handles the left-over accumulations
+ for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const uint8x8_t p00 = vld1_u8(mtx_a0);
+ const uint8x8_t q00l = vld1_u8(mtx_b0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+
+ const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+
+ const int32x2x2_t ia =
+ {
+ {
+ vget_low_s32(ia0),
+ vget_high_s32(ia0)
+ }
+ };
+
+ const int32x4x4_t ib0 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+ }
+ };
+
+ // 4x4 block 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1);
+
+ // 4x4 block 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1);
+
+ // 4x4 block 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1);
+ }
+
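+    // Requantize the accumulators: multiply by the output multiplier, then apply the shift
+    // (vshiftr is assumed to hold the negated shift count, so vshlq_s32 performs a right shift)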
+ c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr);
+ c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr);
+ c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr);
+ c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr);
+
+ c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr);
+ c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr);
+ c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr);
+ c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr);
+
+ c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr);
+ c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr);
+ c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr);
+ c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr);
+
+ c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr);
+ c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr);
+ c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr);
+ c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr);
+
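+    // Saturate the signed 32-bit accumulators down to unsigned 8-bit and pack the four 4x4 blocks
+    // into four rows of 16 output values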
+ const uint8x16x4_t r =
+ {
+ {
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3]))))
+ }
+ };
+
+ uint8_t *const mtx_out = out.ptr();
+ vst1q_u8(mtx_out + 0 * out_stride, r.val[0]);
+ vst1q_u8(mtx_out + 1 * out_stride, r.val[1]);
+ vst1q_u8(mtx_out + 2 * out_stride, r.val[2]);
+ vst1q_u8(mtx_out + 3 * out_stride, r.val[3]);
+ },
+ ina, inb, out);
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..7a3bae50c0
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+ _biases = biases;
+ _accum = accum;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
+ biases_access);
+
+ AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
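+    // The bias window only moves along X (Y is collapsed to a single row), so the same bias values
+    // are added to every row of the accumulator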
+ Iterator in0_out(_accum, window);
+ Iterator in1(_biases, win_biases);
+
+ switch(_accum->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res =
+ {
+ {
+ vaddq_f32(accum.val[0], biases.val[0]),
+ vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]),
+ vaddq_f32(accum.val[3], biases.val[3])
+ }
+ };
+
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+ case DataType::QS8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t accum = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
+ const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
+ },
+ in0_out, in1);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..71dd4c7aa1
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
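+// Computes output += beta * input for F32 data, processing 16 values (four float32x4 vectors) per window step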
+void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float32x4_t beta_f32 = vdupq_n_f32(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+ float32x4x4_t alpha_ab =
+ {
+ {
+ vld1q_f32(out_ptr + 0),
+ vld1q_f32(out_ptr + 4),
+ vld1q_f32(out_ptr + 8),
+ vld1q_f32(out_ptr + 12)
+ }
+ };
+
+ const float32x4x4_t c =
+ {
+ {
+ vld1q_f32(in_ptr + 0),
+ vld1q_f32(in_ptr + 4),
+ vld1q_f32(in_ptr + 8),
+ vld1q_f32(in_ptr + 12)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+
+ vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
+ vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
+ vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+ },
+ in, out);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float16x8_t beta_f16 = vdupq_n_f16(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+ float16x8x2_t alpha_ab =
+ {
+ {
+ vld1q_f16(out_ptr + 0),
+ vld1q_f16(out_ptr + 8)
+ }
+ };
+
+ float16x8x2_t c =
+ {
+ {
+ vld1q_f16(in_ptr + 0),
+ vld1q_f16(in_ptr + 8)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+
+ vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+ },
+ in, out);
+}
+#endif
+
+void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
+
+ qint8x16_t alpha_ab = vld1q_qs8(out_ptr);
+ const qint8x16_t c = vld1q_qs8(in_ptr);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
+
+ vst1q_qs8(out_ptr, alpha_ab);
+ },
+ in, out);
+}
+} // namespace
+
+NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
+ : INESimpleKernel(), _func(nullptr), _beta(0.0f)
+{
+}
+
+void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &matrix_addition_f32;
+ break;
+ case DataType::QS8:
+ _func = &matrix_addition_qs8;
+ break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &matrix_addition_f16;
+ break;
+#endif
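+        // Without ARM_COMPUTE_ENABLE_FP16 the F16 case falls through to the unsupported data type error below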
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ _beta = beta;
+}
+
+void NEGEMMMatrixAdditionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
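+    // Nothing to add when beta is zero, so skip the computation entirely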
+ if(_beta != 0.0f)
+ {
+ (*_func)(_input, _output, window, _beta);
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..dcfbb13081
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,1168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+template <bool multiply_alpha>
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * window.thread_id();
+ const int window_step_x = 16 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
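+    // Each thread starts at a different 16-column offset and strides by 16 * num_threads, so the
+    // columns of the output are statically partitioned across the threads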
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
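+        // Main loop, unrolled by 2: each iteration consumes 4 elements of vector A and the 4
+        // corresponding rows of matrix B (16 columns at a time)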
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+ acc0 = vmulq_f32(acc0, alpha_f32);
+ acc1 = vmulq_f32(acc1, alpha_f32);
+ acc2 = vmulq_f32(acc2, alpha_f32);
+ acc3 = vmulq_f32(acc3, alpha_f32);
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ const int fixed_point_position = input0->info()->fixed_point_position();
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * window.thread_id();
+ const int window_step_x = 32 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ // Reset accumulators
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+
+ auto vec_a = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
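+        // Main loop: each iteration consumes 2 fixed-point elements of vector A and the 2 matching
+        // rows of matrix B (32 columns at a time)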
+ for(; vec_a <= (vec_a_end_addr - 2);)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
+ const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
+ const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
+ const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
+ const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 8x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 because the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 4 because the transposed input matrix B has 4 times fewer columns than the output matrix
+ // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping makes the implementation cache friendly and avoids the data re-arrangements otherwise needed to compute 16x4 elements per iteration
+    // All the values needed to compute a single 4x4 block are read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ float32x4_t acc00 = vdupq_n_f32(0.f);
+ float32x4_t acc10 = vdupq_n_f32(0.f);
+ float32x4_t acc20 = vdupq_n_f32(0.f);
+ float32x4_t acc30 = vdupq_n_f32(0.f);
+
+ float32x4_t acc01 = vdupq_n_f32(0.f);
+ float32x4_t acc11 = vdupq_n_f32(0.f);
+ float32x4_t acc21 = vdupq_n_f32(0.f);
+ float32x4_t acc31 = vdupq_n_f32(0.f);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
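+        // Main loop, unrolled by 4: each iteration consumes 32 interleaved values of A and 32 values
+        // from each of the two transposed B rows, accumulating two adjacent 4x4 output blocks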
+ for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for(; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+ acc00 = vmulq_f32(acc00, alpha_f32);
+ acc10 = vmulq_f32(acc10, alpha_f32);
+ acc20 = vmulq_f32(acc20, alpha_f32);
+ acc30 = vmulq_f32(acc30, alpha_f32);
+ acc01 = vmulq_f32(acc01, alpha_f32);
+ acc11 = vmulq_f32(acc11, alpha_f32);
+ acc21 = vmulq_f32(acc21, alpha_f32);
+ acc31 = vmulq_f32(acc31, alpha_f32);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+ const auto mtx_out1 = mtx_out0 + 4;
+
+ // Store the 4 blocks
+ vst1q_f32(mtx_out0, acc00);
+ vst1q_f32(mtx_out1, acc01);
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ vst1q_f32(mtx_out1 + out_stride1, acc11);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ vst1q_f32(mtx_out1 + out_stride2, acc21);
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out1 + out_stride3, acc31);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 because the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 because the transposed input matrix B has 8 times fewer columns than the output matrix
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+    // Number of iterations of the inner loop. Since 8 is the number of accumulations per loop, num_it = (width_mtx_b / 4) / 8
+ const size_t num_it = ((input1->info()->dimension(0)) >> 2) >> 3;
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
+ const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
+ auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+ float16x8x4_t c =
+ {
+ {
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f)
+ }
+ };
+
+ /*
+ This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ |a00 a01 a02 a03 | a04 a05 a06 a07|
+ |a10 a11 a12 a13 | a14 a15 a16 a17|
+ |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
+       |a30 a31 a32 a33 | a34 a35 a36 a37|                  | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
+ |a40 a41 a42 a43 | a44 a45 a46 a47|
+ |a50 a51 a52 a53 | a54 a55 a56 a57|
+ |a60 a61 a62 a63 | a64 a65 a66 a67|
+ |a70 a71 a72 a73 | a74 a75 a76 a77|
+
+ After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+
+ B Matrix has been transposed as shown below
+
+ |b00 b01 b02 b03 b04 b05 b06 b07|
+ |b10 b11 b12 b13 b14 b15 b16 b17|
+ |b20 b21 b22 b23 b24 b25 b26 b27|
+ |b30 b31 b32 b33 b34 b35 b36 b37|
+ ------------------->
+
+ |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|
+
+ c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
+ c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31
+
+       The output tensor's XY-plane must have the following shape: [ width * 8, height / 8 ]. All other dimensions must have the same size.
+ */
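+        // Each iteration consumes 16 interleaved F16 values of A and 32 transposed values of B,
+        // accumulating a 4x8 block of the output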
+ for(size_t k = num_it; k > 0; mtx_a0 += 16, mtx_b0 += 32, --k)
+ {
+ const float16x8_t p00 = vld1q_f16(mtx_a0);
+ const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+ const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+ const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+ const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+ }
+
+ if(multiply_alpha)
+ {
+ c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+ c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+ c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+ c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+ }
+
+ vst1q_f16(mtx_out + 0 * out_stride, c.val[0]);
+ vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+ vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+ vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ },
+ ina, inb, out);
+#else
+ ARM_COMPUTE_ERROR("Not implemented");
+#endif
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+ const int fixed_point_position = input0->info()->fixed_point_position();
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ ARM_COMPUTE_UNUSED(alpha_qs8);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 because the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 because the transposed input matrix B has 16 times fewer columns than the output matrix
+ // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping makes the implementation cache friendly and avoids the data re-arrangements otherwise needed to compute 16x4 elements per iteration
+    // All the values needed to compute a single 32x4 block are read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
+
+ int k = 0;
+        // Main loop: each iteration performs two accumulation stages, consuming 8 interleaved values of matrix A and 32 values from each of the two transposed rows of matrix B
+ for(; k <= (num_elems_matrix_b_x - 32); k += 32)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+ const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
+ const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
+ const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
+ const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+
+ const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
+ const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
+ const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
+ const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
+
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
+
+ mtx_a0 += 8;
+ mtx_b0 += 32;
+ mtx_b1 += 32;
+ }
+
+        // This loop handles the left-over accumulations
+ for(; k < num_elems_matrix_b_x; k += 16)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+ mtx_a0 += 4;
+ mtx_b0 += 16;
+ mtx_b1 += 16;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
+ qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
+ qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
+
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
+ qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
+ qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
+
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
+ qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
+ qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
+
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+ qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
+ qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
+ qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
+ acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
+ acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
+ acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
+ acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
+ acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
+ acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
+ acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
+ acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 32x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
+ },
+ ina, inb, out);
+}
+
+} // namespace
+
+NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _alpha(1.0f)
+{
+}
+
+void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ if(output->info()->dimension(1) == 1)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ }
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _alpha = alpha;
+
+ unsigned int num_elems_processed_per_iteration_x = 0;
+ const unsigned int num_elems_processed_per_iteration_y = 4;
+
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if((output->info()->dimension(1) == 1))
+ {
+ switch(input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+ }
+ else
+ {
+ switch(input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ num_elems_processed_per_iteration_x = 8;
+ break;
+#endif
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f),
+ AccessWindowTranspose(input1->info(), 0, 0, 4, 1, 0.f, 0.25f),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+ }
+}
+
+void NEGEMMMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
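+ // Only multiply by alpha when it differs from 1.0f beyond a small tolerance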
+ bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if((_output->info()->dimension(1) == 1))
+ {
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::QS8:
+ {
+ multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ multiply_alpha ? matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::QS8:
+ {
+ multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
+ break;
+#endif
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ccf5cb4de3
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstring>
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const float scale_x = num_elems_processed_per_iteration;
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMTranspose1xWKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ /*
+ * The following example shows how the 1xW transposition works when the input data type is F32
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
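+ *
+ * For example, with F32 data W = 16 / 4 = 4, so the 4x4 input above becomes a
+ * 16x1 output, i.e. the four rows laid out one after the other.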
+ */
+
+ // Set window for output tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, window);
+ Iterator out(_output, win_out);
+
+ switch(_input->info()->element_size())
+ {
+ case 1:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1];
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Output address = base addr + (y * 16) + (x / 16 ) * stride
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride;
+ vst1q_u8(out_ptr, vld1q_u8(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 2:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Output address = base addr + (y * 8) + (x / 8 ) * stride
+ const auto in_ptr = reinterpret_cast<const uint16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
+ vst1q_u16(out_ptr, vld1q_u16(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 4:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Output address = base addr + (y * 4) + (x / 4 ) * stride
+ const auto in_ptr = reinterpret_cast<const uint32_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint32_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
+ vst1q_u32(out_ptr, vld1q_u32(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..419f4825ef
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+BorderSize NEGaussian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEGaussian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const uint8_t *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t four = vdupq_n_s16(4);
+
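+ // The loop below applies the 3x3 Gaussian coefficients
+ //   1 2 1
+ //   2 4 2
+ //   1 2 1
+ // and the final shift-right by 4 divides the accumulated sum by 16
+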
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ int16x8_t out = top_s16.val[0];
+ //top mid
+ out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two);
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vmlaq_s16(out, mid_s16.val[0], two);
+ //mid mid
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four);
+ //mid right
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1_u8(output.ptr(), vqshrun_n_s16(out, 4));
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..f872cc2f0a
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEGaussian5x5HorKernel::NEGaussian5x5HorKernel()
+ : _border_size(0)
+{
+}
+
+BorderSize NEGaussian5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEGaussian5x5HorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian5x5HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+
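+ // Horizontal pass of the 5x5 Gaussian: coefficients 1 4 6 4 1 are applied along x;
+ // the intermediate result is kept in S16 and normalised in the vertical pass
+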
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vaddq_s16(data_s16.val[0], vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+BorderSize NEGaussian5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 32;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian5x5VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const uint8_t *input_top2_ptr = _input->ptr_to_element(Coordinates(0, -2));
+ const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(0, -1));
+ const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(0, 0));
+ const uint8_t *input_low_ptr = _input->ptr_to_element(Coordinates(0, 1));
+ const uint8_t *input_low2_ptr = _input->ptr_to_element(Coordinates(0, 2));
+
+ const uint16x8_t six = vdupq_n_u16(6);
+ const uint16x8_t four = vdupq_n_u16(4);
+
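+ // Vertical pass: coefficients 1 4 6 4 1 are applied along y, followed by a
+ // shift-right of 8 which normalises the combined 5x5 filter (weight sum 16 * 16 = 256)
+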
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t input_offset_high_s16 = input.offset();
+ const size_t input_offset_low_s16 = input.offset() + 16;
+
+ //HIGH DATA
+ //top2
+ uint16x8_t data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_high_s16)));
+ uint16x8_t out_high = data_high;
+ //top
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, four);
+ //mid
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, six);
+ //low
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, four);
+ //low2
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_high_s16)));
+ out_high = vaddq_u16(out_high, data_high);
+
+ //LOW DATA
+ //top2
+ uint16x8_t data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_low_s16)));
+ uint16x8_t out_low = data_low;
+ //top
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, four);
+ //mid
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, six);
+ //low
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, four);
+ //low2
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_low_s16)));
+ out_low = vaddq_u16(out_low, data_low);
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqshrn_n_u16(out_high, 8),
+ vqshrn_n_u16(out_low, 8)));
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..52d1fbf028
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel()
+ : _border_size(0), _l2_load_offset(0)
+{
+}
+
+BorderSize NEGaussianPyramidHorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 32;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr float scale_x = 0.5f;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+ // Sub-sampling selects odd pixels (1, 3, 5, ...) for images with even
+ // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+ // a pixel is even or odd is determined based on the tensor shape not the
+ // valid region!)
+ // Thus the offset from which the first pixel (L2) for the convolution is
+ // loaded depends on the anchor and shape of the valid region.
+ // In the case of an even shape (= even image width) we need to load L2
+ // from -2 if the anchor is odd and from -1 if the anchor is even. That
+ // makes sure that L2 is always loaded from an odd pixel.
+ // On the other hand, for an odd shape (= odd image width) we need to load
+ // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+ // achieve the opposite effect.
+ // The condition can be simplified to checking whether anchor + shape is
+ // odd (-2) or even (-1) as only adding an odd and an even number will have
+ // an odd result.
+ _l2_load_offset = -border_size().left;
+
+ if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+ {
+ _l2_load_offset += 1;
+ }
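+
+ // Illustrative example: with an even valid-region width and an even anchor,
+ // anchor + shape is even, so the offset becomes -2 + 1 = -1 (an odd pixel,
+ // as required); with an odd width and an even anchor it stays at -2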
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+ valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEGaussianPyramidHorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, _l2_load_offset);
+
+ Iterator in(_input, win_in);
+
+ // The output is half the width of the input
+ Window win_out(window);
+ win_out.scale(Window::DimX, 0.5f);
+
+ Iterator out(_output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16x2_t data_2q = vld2q_u8(in.ptr());
+ const uint8x16_t &data_even = data_2q.val[0];
+ const uint8x16_t &data_odd = data_2q.val[1];
+
+ const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even)));
+ const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd)));
+ const int16x8_t data_m = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1))));
+ const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1))));
+ const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2))));
+
+ int16x8_t out_val = vaddq_s16(data_l2, data_r2);
+ out_val = vmlaq_s16(out_val, data_l1, four);
+ out_val = vmlaq_s16(out_val, data_m, six);
+ out_val = vmlaq_s16(out_val, data_r1, four);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()), out_val);
+ },
+ in, out);
+}
+
+NEGaussianPyramidVertKernel::NEGaussianPyramidVertKernel()
+ : _t2_load_offset(0)
+{
+}
+
+BorderSize NEGaussianPyramidVertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_rows_processed_per_iteration = 2;
+
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_written_per_iteration = 1;
+
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ constexpr float scale_y = 0.5f;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), border_undefined, border_size());
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration, 1.f, scale_y);
+
+ // Determine whether we need to load even or odd rows. See above for a
+ // detailed explanation.
+ _t2_load_offset = -border_size().top;
+
+ if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+ {
+ _t2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+ valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEGaussianPyramidVertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
+ ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ static const uint16x8_t six = vdupq_n_u16(6);
+ static const uint16x8_t four = vdupq_n_u16(4);
+
+ Window win_in(window);
+ // Load 8 values twice instead of 16 values at once
+ win_in.set_dimension_step(Window::DimX, 8);
+ win_in.shift(Window::DimY, _t2_load_offset);
+
+ Iterator in(_input, win_in);
+
+ // Output's height is half of input's
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.5f);
+
+ Iterator out(_output, win_out);
+
+ const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 0));
+ const uint8_t *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 1));
+ const uint8_t *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 2));
+ const uint8_t *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3));
+ const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4));
+
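+ // Each iteration filters two groups of 8 columns with the vertical coefficients
+ // 1 4 6 4 1, narrows each result with a shift-right of 8, and writes 16 U8 output
+ // pixels; the window steps two input rows per output row
+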
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Low data
+ const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
+ const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
+ const uint16x8_t data_low_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
+ const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
+ const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
+
+ uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2);
+ out_low = vmlaq_u16(out_low, data_low_t1, four);
+ out_low = vmlaq_u16(out_low, data_low_m, six);
+ out_low = vmlaq_u16(out_low, data_low_b1, four);
+
+ in.increment(Window::DimX);
+
+ // High data
+ const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
+ const uint16x8_t data_high_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
+ const uint16x8_t data_high_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
+ const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
+ const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
+
+ uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2);
+ out_high = vmlaq_u16(out_high, data_high_t1, four);
+ out_high = vmlaq_u16(out_high, data_high_m, six);
+ out_high = vmlaq_u16(out_high, data_high_b1, four);
+
+ vst1q_u8(out.ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8)));
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..404ad8a388
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstring>
+
+using namespace arm_compute;
+
+namespace
+{
+void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr,
+ size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
+{
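+ // Each pixel's gradient magnitude is split between two neighbouring orientation
+ // bins: the scaled phase selects the lower bin, and its fractional part (w1)
+ // weights the share given to the next bin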
+ const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
+ static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
+ static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
+ static const int32x4_t zero_s32 = vdupq_n_s32(0);
+ static const int32x4_t one_s32 = vdupq_n_s32(1);
+ const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
+
+ memset(output_ptr, 0, sizeof(float) * num_bins);
+
+ for(size_t yc = 0; yc < cell_height; ++yc)
+ {
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
+ {
+ // Load magnitude and phase values
+ const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
+ const int16x4_t mag_s16 = vld1_s16(mag_row_ptr + xc + yc * mag_stride);
+
+ // Convert magnitude and phase to float
+ const float32x4_t mag_f32 = vcvtq_f32_s32(vmovl_s16(mag_s16));
+ float32x4_t phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
+
+ // Compute histogram index.
+ int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
+
+ // Compute magnitude weights (w0 and w1)
+ const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
+
+ // w1 = phase_f32 - hidx_f32
+ const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
+
+ // w0 = 1.0 - w1
+ const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);
+
+ // Compute contribute for splitting vote
+ const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
+ const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to num_bins. If so, replace the index with 0
+ uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
+ hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
+
+ // Bin 0
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);
+
+ hidx_s32 = vaddq_s32(hidx_s32, one_s32);
+
+ // Check if the histogram index is equal to num_bins
+ mask = vceqq_s32(hidx_s32, num_bins_s32);
+ hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
+
+ // Bin1
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
+ }
+
+ for(; xc < static_cast<int32_t>(cell_width); ++xc)
+ {
+ const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
+ const float mag_value = *(mag_row_ptr + xc + yc * mag_stride);
+
+ const float w1 = phase_value - std::floor(phase_value);
+
+ // The quantised phase is the histogram index [0, num_bins - 1] - Round
+ // Check limit of histogram index. If hidx == num_bins, hidx = 0
+ const auto hidx = static_cast<size_t>(phase_value) % num_bins;
+
+ // Weighted vote between 2 bins
+ *(output_ptr + hidx) += mag_value * (1.0f - w1);
+ *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
+ }
+ }
+}
+
+void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+ size_t cell_height, size_t num_bins, float phase_scale)
+{
+ const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
+ static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
+ static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
+ static const int32x4_t zero_s32 = vdupq_n_s32(0);
+ static const int32x4_t one_s32 = vdupq_n_s32(1);
+ const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
+
+ memset(output_ptr, 0, sizeof(float) * num_bins);
+
+ for(size_t yc = 0; yc < cell_height; ++yc)
+ {
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
+ {
+ // Load magnitude and phase values
+ const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
+ const int16x8_t mag_s16 = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);
+
+ // Convert phase to U16
+ const uint16x8_t phase_u16 = vmovl_u8(phase_u8);
+
+ // Convert magnitude to float32
+ const float32x4x2_t mag_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
+ }
+ };
+
+ // Convert phase to float32
+ float32x4x2_t phase_f32 =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
+ }
+ };
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
+ phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);
+
+ // Compute histogram index.
+ int32x4x2_t hidx_s32 =
+ {
+ {
+ vcvtq_s32_f32(phase_f32.val[0]),
+ vcvtq_s32_f32(phase_f32.val[1])
+ }
+ };
+
+ // Compute magnitude weights (w0 and w1)
+ const float32x4x2_t hidx_f32 =
+ {
+ {
+ vcvtq_f32_s32(hidx_s32.val[0]),
+ vcvtq_f32_s32(hidx_s32.val[1])
+ }
+ };
+
+ float32x4x2_t w1_f32 =
+ {
+ {
+ vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
+ vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
+ }
+ };
+
+ float32x4x2_t w0_f32 =
+ {
+ {
+ vsubq_f32(one_f32, w1_f32.val[0]),
+ vsubq_f32(one_f32, w1_f32.val[1])
+ }
+ };
+
+ // Compute contribute for splitting vote
+ const float32x4x2_t mag_w0_f32 =
+ {
+ {
+ vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
+ vmulq_f32(mag_f32.val[1], w0_f32.val[1])
+ }
+ };
+
+ const float32x4x2_t mag_w1_f32 =
+ {
+ {
+ vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
+ vmulq_f32(mag_f32.val[1], w1_f32.val[1])
+ }
+ };
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to num_bins
+ uint32x4x2_t mask =
+ {
+ {
+ vceqq_s32(hidx_s32.val[0], num_bins_s32),
+ vceqq_s32(hidx_s32.val[1], num_bins_s32)
+ }
+ };
+
+ hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
+ hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
+
+ // First bin - Low
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);
+
+ // First bin - high
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);
+
+ hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
+ hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);
+
+ // Check if the histogram index is equal to num_bins
+ mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
+ mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);
+
+ hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
+ hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
+
+ // Second bin - Low
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);
+
+ // Second bin - high
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
+ }
+
+ for(; xc < static_cast<int32_t>(cell_width); xc++)
+ {
+ const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
+ const float mag_value = *(mag_row_ptr + xc + yc * mag_stride);
+
+ const float w1 = phase_value - std::floor(phase_value);
+
+ // The quantised phase is the histogram index [0, num_bins - 1] - Round
+ // Check limit of histogram index. If hidx == num_bins, hidx = 0
+ const size_t hidx = static_cast<size_t>(phase_value) % num_bins;
+
+ // Weighted vote between 2 bins
+ *(output_ptr + hidx) += mag_value * (1.0f - w1);
+ *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
+ }
+ }
+}
+
+void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride,
+ size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
+{
+ ARM_COMPUTE_UNUSED(l2_hyst_threshold);
+
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L2-Norm
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ // Compute left over
+ for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += input_value * input_value;
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ const float32x4_t scale_f32 = vdupq_n_f32(scale);
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ output_ptr[i] *= scale;
+ }
+}
+
+void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+ float l2_hyst_threshold)
+{
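+ // L2-Hys normalisation: first L2-normalise the block histogram, then clip every
+ // value at l2_hyst_threshold, and finally renormalise the clipped vector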
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L2-Hys
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ // Compute left over
+ for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += input_value * input_value;
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ float32x4_t scale_f32 = vdupq_n_f32(scale);
+ const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
+
+ // Reset sum
+ sum_f32 = vdupq_n_f32(0.0f);
+ sum = 0.0f;
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ // Clip input_value if over _threshold_l2hys
+ input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
+ input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
+ input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
+ input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ float input_value = output_ptr[i] * scale;
+
+ // Clip scaled input_value if over _threshold_L2hys
+ input_value = std::min(input_value, l2_hyst_threshold);
+
+ sum += input_value * input_value;
+
+ output_ptr[i] = input_value;
+ }
+
+ // We use the same constants as OpenCV
+ scale = 1.0f / (std::sqrt(sum) + 1e-3f);
+ scale_f32 = vdupq_n_f32(scale);
+
+ // Rescale
+ i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ // Store result
+ output_ptr[i] *= scale;
+ }
+}
+
+void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+ float l2_hyst_threshold)
+{
+ ARM_COMPUTE_UNUSED(l2_hyst_threshold);
+
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L1-Norm
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+ // Compute |input_value|
+ sum_f32 += vabsq_f32(input_value.val[0]);
+ sum_f32 += vabsq_f32(input_value.val[1]);
+ sum_f32 += vabsq_f32(input_value.val[2]);
+ sum_f32 += vabsq_f32(input_value.val[3]);
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += std::abs(input_value);
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ const float32x4_t scale_f32 = vdupq_n_f32(scale);
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ output_ptr[i] *= scale;
+ }
+}
+} // namespace
+
+NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
+ : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
+{
+}
+
+void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+ _input_magnitude = input_magnitude;
+ _input_phase = input_phase;
+ _output = output;
+ _cell_width = hog_info->cell_size().width;
+ _cell_height = hog_info->cell_size().height;
+ _num_bins = hog_info->num_bins();
+ _phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
+ _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
+
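+    // Select the orientation binning function according to the cell width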
+ if(_cell_width < 8)
+ {
+ _func = &cell_width_lt8;
+ }
+ else
+ {
+ _func = &cell_width_ge8;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ const unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _cell_height;
+ const unsigned int num_elems_written_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEHOGOrientationBinningKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const size_t mag_stride = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
+ const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());
+
+ Window win_mag(window);
+ win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
+ win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
+
+ Window win_phase(win_mag);
+
+ Iterator mag(_input_magnitude, win_mag);
+ Iterator phase(_input_phase, win_phase);
+ Iterator out(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mag_row_ptr = reinterpret_cast<const int16_t *>(mag.ptr());
+ const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
+ const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
+
+ (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
+ },
+ mag, phase, out);
+}
+
+NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
+{
+}
+
+void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+ // Number of cells per block
+ const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+ hog_info->block_size().height / hog_info->cell_size().height);
+
+ // Number of cells per block stride
+ const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+ hog_info->block_stride().height / hog_info->cell_size().height);
+
+ _input = input;
+ _output = output;
+ _l2_hyst_threshold = hog_info->l2_hyst_threshold();
+ _num_cells_per_block = num_cells_per_block;
+ _num_cells_per_block_stride = num_cells_per_block_stride;
+ _num_bins = hog_info->num_bins();
+
+ ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));
+
+ switch(hog_info->normalization_type())
+ {
+ case HOGNormType::L2_NORM:
+ _func = &l2_norm;
+ break;
+ case HOGNormType::L2HYS_NORM:
+ _func = &l2hys_norm;
+ break;
+ case HOGNormType::L1_NORM:
+ _func = &l1_norm;
+ break;
+ default:
+            ARM_COMPUTE_ERROR("Normalisation type not supported");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ const unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _num_cells_per_block.height;
+ const unsigned int num_elems_written_per_iteration = 1;
+ const unsigned int num_rows_written_per_iteration = _num_cells_per_block.height;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEHOGBlockNormalizationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get number of bins per block
+ const size_t num_bins_per_block = _output->info()->num_channels();
+
+ // Number of bins on the same row of the block
+ const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
+
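+    // Step through the input one block stride at a time (expressed in cells)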
+ Window win_in(window);
+ win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+ win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ // Normalises blocks
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
+
+ // Execute normalization function
+ (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..4af22bca75
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEHOGDetectorKernel::NEHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0),
+ _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex()
+{
+}
+
+void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _threshold = threshold;
+ _idx_class = idx_class;
+ _hog_descriptor = hog->descriptor();
+ _bias = _hog_descriptor[hog->info()->descriptor_size() - 1];
+ _num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ _num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+ _block_stride_width = block_stride.width;
+ _block_stride_height = block_stride.height;
+ _detection_window_width = detection_window_size.width;
+ _detection_window_height = detection_window_size.height;
+ _max_num_detection_windows = detection_windows->max_num_values();
+
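+    // The descriptor must hold one weight per histogram bin of the detection window plus the bias term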
+ ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHOGDetectorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr);
+
+ const size_t in_step_y = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
+
+ Iterator in(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto *in_row_ptr = reinterpret_cast<const float *>(in.ptr());
+
+ // Init score_f32 with 0
+ float32x4_t score_f32 = vdupq_n_f32(0.0f);
+
+ // Init score with bias
+ float score = _bias;
+
+ // Compute Linear SVM
+ for(size_t yb = 0; yb < _num_blocks_per_descriptor_y; ++yb, in_row_ptr += in_step_y)
+ {
+ int32_t xb = 0;
+
+ const int32_t offset_y = yb * _num_bins_per_descriptor_x;
+
+ for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x) - 16; xb += 16)
+ {
+ // Load descriptor values
+ const float32x4x4_t a_f32 =
+ {
+ {
+ vld1q_f32(&in_row_ptr[xb + 0]),
+ vld1q_f32(&in_row_ptr[xb + 4]),
+ vld1q_f32(&in_row_ptr[xb + 8]),
+ vld1q_f32(&in_row_ptr[xb + 12])
+ }
+ };
+
+ // Load detector values
+ const float32x4x4_t b_f32 =
+ {
+ {
+ vld1q_f32(&_hog_descriptor[xb + 0 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 4 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 8 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 12 + offset_y])
+ }
+ };
+
+ // Multiply accumulate
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[0], b_f32.val[0]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[1], b_f32.val[1]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[2], b_f32.val[2]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[3], b_f32.val[3]);
+ }
+
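+            // Process the remaining descriptor bins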
+ for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x); ++xb)
+ {
+ const float a = in_row_ptr[xb];
+ const float b = _hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ score += vgetq_lane_f32(score_f32, 0);
+ score += vgetq_lane_f32(score_f32, 1);
+ score += vgetq_lane_f32(score_f32, 2);
+ score += vgetq_lane_f32(score_f32, 3);
+
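+        // Add a detection window if the score exceeds the threshold and the array is not full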
+ if(score > _threshold)
+ {
+ if(_detection_windows->num_values() < _max_num_detection_windows)
+ {
+ DetectionWindow win;
+ win.x = (id.x() * _block_stride_width);
+ win.y = (id.y() * _block_stride_height);
+ win.width = _detection_window_width;
+ win.height = _detection_window_height;
+ win.idx_class = _idx_class;
+ win.score = score;
+
+ std::unique_lock<std::mutex> lock(_mutex);
+ _detection_windows->push_back(win);
+ lock.unlock();
+ }
+ }
+ },
+ in);
+}
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..585676bb87
--- /dev/null
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+using namespace arm_compute;
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+
+template class arm_compute::NEHarrisScoreFP16Kernel<3>;
+template class arm_compute::NEHarrisScoreFP16Kernel<5>;
+template class arm_compute::NEHarrisScoreFP16Kernel<7>;
+
+namespace fp16
+{
+inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
+{
+ static const float16x8_t zero = vdupq_n_f16(0.f);
+
+ // Trace^2
+ float16x8_t trace2 = vaddq_f16(gx2, gy2);
+ trace2 = vmulq_f16(trace2, trace2);
+
+ // Det(A)
+ float16x8_t det = vmulq_f16(gx2, gy2);
+ det = vfmsq_f16(det, gxgy, gxgy);
+
+ // Det(A) - sensitivity * trace^2
+ const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
+
+ // mc > strength_thresh
+ const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
+
+ return vbslq_f16(mask, mc, zero);
+}
+
+template <size_t block_size>
+inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
+ float norm_factor)
+{
+ const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
+
+ // Normalize
+ low_gx = vmulq_f16(low_gx, norm_factor_fp16);
+ low_gy = vmulq_f16(low_gy, norm_factor_fp16);
+ high_gx = vmulq_f16(high_gx, norm_factor_fp16);
+ high_gy = vmulq_f16(high_gy, norm_factor_fp16);
+
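+    // Accumulate Gx^2, Gy^2 and Gx*Gy for each of the block_size horizontal offsets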
+ float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
+ float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 1);
+ gy = vextq_f16(low_gy, high_gy, 1);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 2);
+ gy = vextq_f16(low_gy, high_gy, 2);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ if(block_size > 3)
+ {
+ gx = vextq_f16(low_gx, high_gx, 3);
+ gy = vextq_f16(low_gy, high_gy, 3);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 4);
+ gy = vextq_f16(low_gy, high_gy, 4);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+ }
+
+ if(block_size == 7)
+ {
+ gx = vextq_f16(low_gx, high_gx, 5);
+ gy = vextq_f16(low_gy, high_gy, 5);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 6);
+ gy = vextq_f16(low_gy, high_gy, 6);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+ }
+}
+
+template <size_t block_size>
+inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
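+    // Point at the top-left corner of the block_size x block_size window centred on the current pixel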
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = vdupq_n_f16(0.0f);
+ float16x8_t gy2 = vdupq_n_f16(0.0f);
+ float16x8_t gxgy = vdupq_n_f16(0.0f);
+
+ for(size_t i = 0; i < block_size; ++i)
+ {
+ const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
+ const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
+ const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
+ const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += in_stride;
+ gy_ptr_0 += in_stride;
+ gx_ptr_1 += in_stride;
+ gy_ptr_1 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+template <size_t block_size>
+inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = zero;
+ float16x8_t gy2 = zero;
+ float16x8_t gxgy = zero;
+
+ for(size_t i = 0; i < block_size; ++i)
+ {
+ const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
+ const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
+ vget_low_f16(zero));
+ const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
+ const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
+ vget_low_f16(zero));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += in_stride;
+ gy_ptr_0 += in_stride;
+ gx_ptr_1 += in_stride;
+ gy_ptr_1 += in_stride;
+ gx_ptr_2 += in_stride;
+ gy_ptr_2 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+template <>
+inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
+ const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = zero;
+ float16x8_t gy2 = zero;
+ float16x8_t gxgy = zero;
+
+ for(size_t i = 0; i < 7; ++i)
+ {
+ const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
+ const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
+ const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
+ const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+        gx_ptr_0 += in_stride;
+        gy_ptr_0 += in_stride;
+        gx_ptr_1 += in_stride;
+        gy_ptr_1 += in_stride;
+        gx_ptr_2 += in_stride;
+        gy_ptr_2 += in_stride;
+        gx_ptr_3 += in_stride;
+        gy_ptr_3 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+} // namespace fp16
+
+template <int32_t block_size>
+BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <int32_t block_size>
+NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
+ : INEHarrisScoreKernel(), _func(nullptr)
+{
+}
+
+template <int32_t block_size>
+void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
+ },
+ input1, input2, output);
+}
+
+template <int32_t block_size>
+void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
+
+ if(input1->info()->data_type() == DataType::S16)
+ {
+ _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
+ }
+ else
+ {
+ _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
+ }
+
+ ARM_COMPUTE_ERROR_ON(nullptr == _func);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = block_size;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+template class arm_compute::NEHarrisScoreKernel<3>;
+template class arm_compute::NEHarrisScoreKernel<5>;
+template class arm_compute::NEHarrisScoreKernel<7>;
+template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
+template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
+template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
+
+namespace
+{
+inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
+{
+ // Trace^2
+ float32x4_t trace2 = vaddq_f32(gx2, gy2);
+ trace2 = vmulq_f32(trace2, trace2);
+
+ // Det(A)
+ float32x4_t det = vmulq_f32(gx2, gy2);
+ det = vmlsq_f32(det, gxgy, gxgy);
+
+ // Det(A) - sensitivity * trace^2
+ const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
+
+ // mc > strength_thresh
+ const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
+
+ return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
+}
+
+inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
+ float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
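+    // Gradient values at the left, middle and right positions of the 1x3 window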
+ const float32x4_t l_gx = low_gx;
+ const float32x4_t l_gy = low_gy;
+ const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
+ const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
+ const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
+ const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Gx*Gx
+ gx2 = vmlaq_f32(gx2, l_gx, l_gx);
+ gx2 = vmlaq_f32(gx2, m_gx, m_gx);
+ gx2 = vmlaq_f32(gx2, r_gx, r_gx);
+
+ // Gy*Gy
+ gy2 = vmlaq_f32(gy2, l_gy, l_gy);
+ gy2 = vmlaq_f32(gy2, m_gy, m_gy);
+ gy2 = vmlaq_f32(gy2, r_gy, r_gy);
+
+ // Gx*Gy
+ gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
+ gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
+ gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
+}
+
+inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
+ float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // L2 values
+ float32x4_t gx = low_gx;
+ float32x4_t gy = low_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L1 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // M values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R1 values
+ gx = vextq_f32(low_gx, high_gx, 3);
+ gy = vextq_f32(low_gy, high_gy, 3);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R2 values
+ gx = high_gx;
+ gy = high_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+}
+
+inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
+ float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // L3 values
+ float32x4_t gx = low_gx;
+ float32x4_t gy = low_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L2 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L1 values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // M values
+ gx = vextq_f32(low_gx, high_gx, 3);
+ gy = vextq_f32(low_gy, high_gy, 3);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R1 values
+ gx = high_gx;
+ gy = high_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+    // Shift the window: the previous high values become the new low values and high_gx1/high_gy1 provide the new high values needed for R2 and R3
+ low_gx = high_gx;
+ low_gy = high_gy;
+ high_gx = high_gx1;
+ high_gy = high_gy1;
+
+ // Normalize
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // R2 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R3 values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+}
+
+inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
+ const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+
+ // Row0
+ int16x8x2_t tmp_gx =
+ {
+ {
+ vld1q_s16(gx_ptr_0 - input_stride),
+ vld1q_s16(gx_ptr_1 - input_stride)
+ }
+ };
+ int16x8x2_t tmp_gy =
+ {
+ {
+ vld1q_s16(gy_ptr_0 - input_stride),
+ vld1q_s16(gy_ptr_1 - input_stride)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row1
+ tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
+ tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
+ tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
+ tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row2
+ tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
+ tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
+ tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
+ tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+
+ // Row0
+ float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
+ float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
+ float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
+ float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row1
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row2
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 5; ++i)
+ {
+ const int16x8x2_t tmp_gx =
+ {
+ {
+ vld1q_s16(gx_ptr_0),
+ vld1q_s16(gx_ptr_1)
+ }
+ };
+ const int16x8x2_t tmp_gy =
+ {
+ {
+ vld1q_s16(gy_ptr_0),
+ vld1q_s16(gy_ptr_1)
+ }
+ };
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 5; ++i)
+ {
+ const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ gx_ptr_2 += input_stride;
+ gy_ptr_2 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4_t gx2 = vdupq_n_f32(0.0f);
+ float32x4_t gy2 = vdupq_n_f32(0.0f);
+ float32x4_t gxgy = vdupq_n_f32(0.0f);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 7; ++i)
+ {
+ const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
+ const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
+ const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
+ const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
+ float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
+ float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
+ harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output, mc);
+}
+
+inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
+ const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4_t gx2 = vdupq_n_f32(0.0f);
+ float32x4_t gy2 = vdupq_n_f32(0.0f);
+ float32x4_t gxgy = vdupq_n_f32(0.0f);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 7; ++i)
+ {
+ const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ gx_ptr_2 += input_stride;
+ gy_ptr_2 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output, mc);
+}
+
+} // namespace
+
+INEHarrisScoreKernel::INEHarrisScoreKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
+{
+}
+
+template <int32_t block_size>
+NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
+ : INEHarrisScoreKernel(), _func(nullptr)
+{
+}
+
+template <int32_t block_size>
+void NEHarrisScoreKernel<block_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
+ },
+ input1, input2, output);
+}
+
+template <int32_t block_size>
+BorderSize NEHarrisScoreKernel<block_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <int32_t block_size>
+void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
+
+ if(input1->info()->data_type() == DataType::S16)
+ {
+ switch(block_size)
+ {
+ case 3:
+ _func = &harris_score3x3_S16_S16_FLOAT;
+ break;
+ case 5:
+ _func = &harris_score5x5_S16_S16_FLOAT;
+ break;
+ case 7:
+ _func = &harris_score7x7_S16_S16_FLOAT;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid block size");
+ break;
+ }
+ }
+ else
+ {
+ switch(block_size)
+ {
+ case 3:
+ _func = &harris_score3x3_S32_S32_FLOAT;
+ break;
+ case 5:
+ _func = &harris_score5x5_S32_S32_FLOAT;
+ break;
+ case 7:
+ _func = &harris_score7x7_S32_S32_FLOAT;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid block size");
+ break;
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON(nullptr == _func);
+
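+    // The 3x3 and 5x5 variants process 8 pixels per iteration, the 7x7 variant processes 4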
+ constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
+ constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
+ constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
+ constexpr unsigned int num_rows_read_per_iteration = block_size;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
new file mode 100644
index 0000000000..9e967ec4f5
--- /dev/null
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
+{
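+    // Add the thread-local histogram into the shared output histogram under the histogram mutex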
+ std::lock_guard<std::mutex> lock(_hist_mtx);
+
+ const unsigned int v_end = (bins / 4) * 4;
+
+ for(unsigned int b = 0; b < v_end; b += 4)
+ {
+ const uint32x4_t tmp_global = vld1q_u32(global_hist + b);
+ const uint32x4_t tmp_local = vld1q_u32(local_hist + b);
+ vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local));
+ }
+
+ for(unsigned int b = v_end; b < bins; ++b)
+ {
+ global_hist[b] += local_hist[b];
+ }
+}
+
+NEHistogramKernel::NEHistogramKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx()
+{
+}
+
+void NEHistogramKernel::histogram_U8(Window win)
+{
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+
+ const size_t bins = _output->num_bins();
+ const int32_t offset = _output->offset();
+ const uint32_t offrange = offset + _output->range();
+ const uint32_t *const w_lut = _window_lut;
+ uint32_t *const local_hist = _local_hist + win.thread_id() * bins;
+
+ // Clear local_histogram
+ std::fill_n(local_hist, bins, 0);
+
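+ // Map every in-range pixel value to its bin through the precomputed window LUT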
+ auto update_local_hist = [&](uint8_t p)
+ {
+ if(offset <= p && p < offrange)
+ {
+ ++local_hist[w_lut[p]];
+ }
+ };
+
+ const unsigned int x_start = win.x().start();
+ const unsigned int x_end = win.x().end();
+
+ // Handle the X dimension manually so it can be split into two loops:
+ // the first uses vector operations, the second processes the leftover
+ // pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ // Calculate local histogram
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ unsigned int x = x_start;
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const uint8x8_t pixels = vld1_u8(input.ptr() + x);
+
+ update_local_hist(vget_lane_u8(pixels, 0));
+ update_local_hist(vget_lane_u8(pixels, 1));
+ update_local_hist(vget_lane_u8(pixels, 2));
+ update_local_hist(vget_lane_u8(pixels, 3));
+ update_local_hist(vget_lane_u8(pixels, 4));
+ update_local_hist(vget_lane_u8(pixels, 5));
+ update_local_hist(vget_lane_u8(pixels, 6));
+ update_local_hist(vget_lane_u8(pixels, 7));
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ update_local_hist(input.ptr()[x]);
+ }
+ },
+ input);
+
+ // Merge histograms
+ merge_histogram(_output->buffer(), local_hist, bins);
+}
+
+void NEHistogramKernel::histogram_fixed_U8(Window win)
+{
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+
+ std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
+
+ const unsigned int x_start = win.x().start();
+ const unsigned int x_end = win.x().end();
+
+ // Handle the X dimension manually so it can be split into two loops:
+ // the first uses vector operations, the second processes the leftover
+ // pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ // Calculate local histogram
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ unsigned int x = x_start;
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const uint8x8_t pixels = vld1_u8(input.ptr() + x);
+
+ ++local_hist[vget_lane_u8(pixels, 0)];
+ ++local_hist[vget_lane_u8(pixels, 1)];
+ ++local_hist[vget_lane_u8(pixels, 2)];
+ ++local_hist[vget_lane_u8(pixels, 3)];
+ ++local_hist[vget_lane_u8(pixels, 4)];
+ ++local_hist[vget_lane_u8(pixels, 5)];
+ ++local_hist[vget_lane_u8(pixels, 6)];
+ ++local_hist[vget_lane_u8(pixels, 7)];
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ ++local_hist[input.ptr()[x]];
+ }
+ },
+ input);
+
+ // Merge histograms
+ merge_histogram(_output->buffer(), local_hist.data(), _max_range_size);
+}
+
+void NEHistogramKernel::calculate_window_lut() const
+{
+ const int32_t offset = _output->offset();
+ const size_t bins = _output->num_bins();
+ const uint32_t range = _output->range();
+
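+ // Values below the offset map to bin 0; a value p within the histogram range maps to bin ((p - offset) * bins) / range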
+ std::fill_n(_window_lut, offset, 0);
+
+ for(unsigned int p = offset; p < _max_range_size; ++p)
+ {
+ _window_lut[p] = ((p - offset) * bins) / range;
+ }
+}
+
+void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == local_hist);
+ ARM_COMPUTE_ERROR_ON(nullptr == window_lut);
+
+ _input = input;
+ _output = output;
+ _local_hist = local_hist;
+ _window_lut = window_lut;
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast<int32_t>(_max_range_size), "Offset is outside the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->range()) > static_cast<int32_t>(_max_range_size) /* max range */, "Range larger than the image value range.");
+
+ // Calculate LUT
+ calculate_window_lut();
+
+ // Set appropriate function
+ _func = &NEHistogramKernel::histogram_U8;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
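+ // This overload selects the fixed-range path: pixel values index the histogram directly, so no window LUT or external scratch histogram is needed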
+ _input = input;
+ _output = output;
+
+ // Set appropriate function
+ _func = &NEHistogramKernel::histogram_fixed_U8;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHistogramKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
new file mode 100644
index 0000000000..c7c23d5d06
--- /dev/null
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T, bool has_pads>
+inline void linearize_volume(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int top_left_x,
+ int top_left_y,
+ int kernel_size,
+ int kernel_depth,
+ int input_w,
+ int input_h,
+ int input_stride_x,
+ int input_stride_y,
+ int input_stride_z,
+ int fixed_point_position)
+{
+ const int kernel_size2 = kernel_size * kernel_size;
+ const int x_e = top_left_x + kernel_size;
+ const int y_e = top_left_y + kernel_size;
+
+ // Linearize volume
+ int d = 0;
+ // This loop linearizes the volume three slices at a time. This allows:
+ // 1) fewer iterations of the outer loop over "d"
+ // 2) an optimized im2col for the first convolution layer, which typically has 3 IFMs
+ for(; d <= (kernel_depth - 3); d += 3)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values will be zeros
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ else
+ {
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
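+ // out_ptr has already advanced through the first of the three slices; skip the two slices written via the kernel_size2 offsets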
+ out_ptr += 2 * kernel_size2;
+ }
+
+ // Left over
+ for(; d < kernel_depth; d++)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values will be zeros
+ memset(out_ptr, 0, kernel_size * sizeof(T));
+ out_ptr += kernel_size;
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *out_ptr = 0;
+ }
+ else
+ {
+ *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ }
+
+ // Append 1 if the convolution layer has biases
+ if(has_bias)
+ {
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ }
+ else
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+ }
+}
+} // namespace
+
+template <typename T, bool has_pads>
+void NEIm2ColKernel::run_generic(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int kernel_depth = _input->info()->dimension(2);
+ const int input_w = _input->info()->dimension(0);
+ const int input_h = _input->info()->dimension(1);
+ const int input_stride_x = _input->info()->strides_in_bytes().x();
+ const int input_stride_y = _input->info()->strides_in_bytes().y();
+ const int input_stride_z = _input->info()->strides_in_bytes().z();
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = _conv_info.pad();
+ std::tie(stride_x, stride_y) = _conv_info.stride();
+
+ // Setup input window
+ const int start_x = -pad_x;
+ const int start_y = -pad_y;
+
+ Window window_in(window);
+ // The first three dimensions of the input are increased by the inner loops
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup output window
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size()));
+ window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first));
+ window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(_input, window_in);
+ Iterator out(_output, window_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int top_left_x = id.x() * stride_x + start_x;
+ const int top_left_y = id.y() * stride_y + start_y;
+
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr = reinterpret_cast<T *>(out.ptr());
+
+ // Linearize volume
+ linearize_volume<T, has_pads>(input_ptr,
+ output_ptr,
+ _has_bias,
+ top_left_x,
+ top_left_y,
+ static_cast<int>(_kernel_size),
+ kernel_depth,
+ input_w,
+ input_h,
+ input_stride_x,
+ input_stride_y,
+ input_stride_z,
+ _input->info()->fixed_point_position());
+ },
+ in, out);
+}
+
+template <typename T>
+void NEIm2ColKernel::run_reduced(const Window &window)
+{
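+ // Reduced path: im2col degenerates to a reshape, so each input volume is copied row by row into a single output row (with the bias appended at the end if required)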
+ const size_t in_width = _input->info()->dimension(0);
+ const size_t in_height = _input->info()->dimension(1);
+ const size_t out_step_x = in_width * _input->info()->element_size();
+ const size_t out_step_y = out_step_x * in_height;
+ const size_t out_width = _output->info()->dimension(0);
+
+ Window in_window(window);
+ in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+ out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
+
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(_input, in_slice);
+ Iterator out(_output, out_slice);
+
+ uint8_t *out_ptr = out.ptr();
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
+ },
+ in);
+
+ // Add bias
+ if(_has_bias)
+ {
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
+ }
+ }
+ }
+ while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
+
+NEIm2ColKernel::NEIm2ColKernel()
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _has_bias(false)
+{
+}
+
+void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+ _conv_info = conv_info;
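+ // Each output row holds kernel_depth * kernel_size^2 values (plus one for the bias), so the kernel size can be recovered from the output width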
+ _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _has_bias = has_bias;
+
+ unsigned int pad_x = 0, pad_y = 0, stride_x = 0, stride_y = 0;
+ std::tie(pad_x, pad_y) = conv_info.pad();
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
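+ // The reduced path applies when im2col is a plain reshape: the output row holds the whole flattened input (W * H * D), the remaining dimensions match, strides are 1 and there is no padding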
+ bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+ && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1))
+ && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+ Window window = calculate_max_window(*input->info(), Steps());
+
+ if(run_img2col_reduced)
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &NEIm2ColKernel::run_reduced<float>;
+ break;
+ case DataType::QS8:
+ _func = &NEIm2ColKernel::run_reduced<qint8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+ break;
+ case DataType::QS8:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
+ window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
+ window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ }
+
+ // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ IKernel::configure(window);
+}
+
+void NEIm2ColKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
new file mode 100644
index 0000000000..3b09a1bdbb
--- /dev/null
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+void NEIntegralImageKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ // The kernel is effectively reading 17 values from -1 as it loads 16
+ // starting at -1 and also 16 starting at 0
+ AccessWindowRectangle output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1);
+ AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_read_access, output_write_access);
+
+ output_write_access.set_valid_region(win, input->info()->valid_region());
+
+ IKernel::configure(win);
+}
+
+BorderSize NEIntegralImageKernel::border_size() const
+{
+ return BorderSize(1, 0, 0, 1);
+}
+
+bool NEIntegralImageKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NEIntegralImageKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const auto output_top_left = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(-1, -1)));
+ const auto output_top_mid = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(0, -1)));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input_pixels = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t tmp =
+ {
+ {
+ vmovl_u8(vget_low_u8(input_pixels)),
+ vmovl_u8(vget_high_u8(input_pixels))
+ }
+ };
+
+ uint32x4x4_t pixels =
+ {
+ {
+ vmovl_u16(vget_low_u16(tmp.val[0])),
+ vmovl_u16(vget_high_u16(tmp.val[0])),
+ vmovl_u16(vget_low_u16(tmp.val[1])),
+ vmovl_u16(vget_high_u16(tmp.val[1]))
+ }
+ };
+
+ // Divide by four as the pointer is now uint32_t instead of uint8_t!
+ const size_t off = output.offset() / 4;
+
+ // Add top mid pixel values
+ const uint32_t *const top_mid_ptr = output_top_mid + off;
+
+ pixels.val[0] = vaddq_u32(vld1q_u32(top_mid_ptr), pixels.val[0]);
+ pixels.val[1] = vaddq_u32(vld1q_u32(top_mid_ptr + 4), pixels.val[1]);
+ pixels.val[2] = vaddq_u32(vld1q_u32(top_mid_ptr + 8), pixels.val[2]);
+ pixels.val[3] = vaddq_u32(vld1q_u32(top_mid_ptr + 12), pixels.val[3]);
+
+ // Subtract top left diagonal values
+ const auto outptr = reinterpret_cast<uint32_t *>(output.ptr());
+ const uint32_t *const top_left_ptr = output_top_left + off;
+
+ pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(top_left_ptr));
+ vst1q_u32(outptr, pixels.val[0]);
+
+ pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(top_left_ptr + 4));
+ vst1q_u32(outptr + 4, pixels.val[1]);
+
+ pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(top_left_ptr + 8));
+ vst1q_u32(outptr + 8, pixels.val[2]);
+
+ pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(top_left_ptr + 12));
+ vst1q_u32(outptr + 12, pixels.val[3]);
+
+ // Perform prefix summation
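+ // outptr[-1] holds the integral value immediately to the left (previous block of this row, or the left border for the first block)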
+ for(auto i = 0; i < 16; ++i)
+ {
+ outptr[i] += outptr[i - 1];
+ }
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
new file mode 100644
index 0000000000..3d2bfb204e
--- /dev/null
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+
+using namespace arm_compute;
+
+/** Constants used for Lucas-Kanade Algorithm */
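+// W_BITS is the number of fractional bits of the fixed-point bilinear interpolation weights; D0 = 2^W_BITS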
+constexpr int W_BITS = 14;
+constexpr float D0 = 1 << W_BITS;
+constexpr float DETERMINANT_THRESHOLD = 1.0e-07f; // Threshold for the determinant. Used for lost tracking criteria
+constexpr float EIGENVALUE_THRESHOLD = 1.0e-04f; // Threshold for the minimum eigenvalue. Used for lost tracking criteria
+constexpr float FLT_SCALE = 1.0f / (1 << 20);
+
+namespace
+{
+enum class BilinearInterpolation
+{
+ BILINEAR_OLD_NEW,
+ BILINEAR_SCHARR
+};
+
+template <typename T>
+constexpr int INT_ROUND(T x, int n)
+{
+ return (x + (1 << (n - 1))) >> n;
+}
+
+template <typename T>
+inline int get_pixel(const ITensor *tensor, int xi, int yi, int iw00, int iw01, int iw10, int iw11, int scale)
+{
+ const auto px00 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi)));
+ const auto px01 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi)));
+ const auto px10 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi + 1)));
+ const auto px11 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi + 1)));
+
+ return INT_ROUND(px00 * iw00 + px01 * iw01 + px10 * iw10 + px11 * iw11, scale);
+}
+
+inline int32x4_t compute_bilinear_interpolation(int16x8_t top_row, int16x8_t bottom_row, int16x4_t w00, int16x4_t w01, int16x4_t w10, int16x4_t w11, int32x4_t shift)
+{
+ // Get the left column of upper row
+ const int16x4_t px00 = vget_low_s16(top_row);
+
+ // Get the right column of upper row
+ const int16x4_t px01 = vext_s16(px00, vget_high_s16(top_row), 1);
+
+ // Get the left column of lower row
+ const int16x4_t px10 = vget_low_s16(bottom_row);
+
+ // Get the right column of lower row
+ const int16x4_t px11 = vext_s16(px10, vget_high_s16(bottom_row), 1);
+
+ // Apply the bilinear filter
+ return vqrshlq_s32(vmull_s16(px00, w00) + vmull_s16(px01, w01) + vmull_s16(px10, w10) + vmull_s16(px11, w11), shift);
+}
+} // namespace
+
+void NELKTrackerKernel::init_keypoints(int start, int end)
+{
+ if(_level == _num_levels - 1)
+ {
+ const float level_scale = pow(_pyramid_scale, _level);
+
+ for(int i = start; i < end; ++i)
+ {
+ _old_points_internal->at(i).x = _old_points->at(i).x * level_scale;
+ _old_points_internal->at(i).y = _old_points->at(i).y * level_scale;
+ _old_points_internal->at(i).tracking_status = true;
+
+ NELKInternalKeypoint keypoint_to_track;
+
+ if(_use_initial_estimate)
+ {
+ keypoint_to_track.x = _new_points_estimates->at(i).x * level_scale;
+ keypoint_to_track.y = _new_points_estimates->at(i).y * level_scale;
+ keypoint_to_track.tracking_status = (_new_points_estimates->at(i).tracking_status == 1);
+ }
+ else
+ {
+ keypoint_to_track.x = _old_points_internal->at(i).x;
+ keypoint_to_track.y = _old_points_internal->at(i).y;
+ keypoint_to_track.tracking_status = true;
+ }
+
+ _new_points_internal->at(i) = keypoint_to_track;
+ }
+ }
+ else
+ {
+ for(int i = start; i < end; ++i)
+ {
+ _old_points_internal->at(i).x /= _pyramid_scale;
+ _old_points_internal->at(i).y /= _pyramid_scale;
+ _new_points_internal->at(i).x /= _pyramid_scale;
+ _new_points_internal->at(i).y /= _pyramid_scale;
+ }
+ }
+}
+
+std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy)
+{
+ int iA11 = 0;
+ int iA12 = 0;
+ int iA22 = 0;
+
+ int32x4_t nA11 = vdupq_n_s32(0);
+ int32x4_t nA12 = vdupq_n_s32(0);
+ int32x4_t nA22 = vdupq_n_s32(0);
+
+ float keypoint_int_x = 0;
+ float keypoint_int_y = 0;
+
+ const float wx = std::modf(keypoint.x, &keypoint_int_x);
+ const float wy = std::modf(keypoint.y, &keypoint_int_y);
+
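+ // Fixed-point bilinear interpolation weights; by construction they sum to D0 = 2^W_BITS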
+ const int iw00 = roundf((1.0f - wx) * (1.0f - wy) * D0);
+ const int iw01 = roundf(wx * (1.0f - wy) * D0);
+ const int iw10 = roundf((1.0f - wx) * wy * D0);
+ const int iw11 = D0 - iw00 - iw01 - iw10;
+
+ const int16x4_t nw00 = vdup_n_s16(iw00);
+ const int16x4_t nw01 = vdup_n_s16(iw01);
+ const int16x4_t nw10 = vdup_n_s16(iw10);
+ const int16x4_t nw11 = vdup_n_s16(iw11);
+
+ // Convert the stride from bytes (uint8_t) to int16_t elements
+ const size_t row_stride = _old_scharr_gx->info()->strides_in_bytes()[1] / 2;
+ const Coordinates top_left_window_corner(static_cast<int>(keypoint_int_x) - _window_dimension / 2, static_cast<int>(keypoint_int_y) - _window_dimension / 2);
+ auto idx = reinterpret_cast<const int16_t *>(_old_scharr_gx->buffer() + _old_scharr_gx->info()->offset_element_in_bytes(top_left_window_corner));
+ auto idy = reinterpret_cast<const int16_t *>(_old_scharr_gy->buffer() + _old_scharr_gy->info()->offset_element_in_bytes(top_left_window_corner));
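+ // The negative shift makes vqrshlq_s32 perform a rounding right shift by W_BITS, removing the weight scaling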
+ static const int32x4_t nshifter_scharr = vdupq_n_s32(-W_BITS);
+
+ for(int ky = 0; ky < _window_dimension; ++ky, idx += row_stride, idy += row_stride)
+ {
+ int kx = 0;
+
+ // Calculate elements in blocks of four as long as possible
+ for(; kx <= _window_dimension - 4; kx += 4)
+ {
+ // Interpolation X
+ const int16x8_t ndx_row1 = vld1q_s16(idx + kx);
+ const int16x8_t ndx_row2 = vld1q_s16(idx + kx + row_stride);
+
+ const int32x4_t nxval = compute_bilinear_interpolation(ndx_row1, ndx_row2, nw00, nw01, nw10, nw11, nshifter_scharr);
+
+ // Interpolation Y
+ const int16x8_t ndy_row1 = vld1q_s16(idy + kx);
+ const int16x8_t ndy_row2 = vld1q_s16(idy + kx + row_stride);
+
+ const int32x4_t nyval = compute_bilinear_interpolation(ndy_row1, ndy_row2, nw00, nw01, nw10, nw11, nshifter_scharr);
+
+ // Store the intermediate data so that it does not need to be recalculated in the later stage
+ vst1q_s32(bilinear_ix + kx + ky * _window_dimension, nxval);
+ vst1q_s32(bilinear_iy + kx + ky * _window_dimension, nyval);
+
+ // Accumulate Ix^2
+ nA11 = vmlaq_s32(nA11, nxval, nxval);
+ // Accumulate Ix * Iy
+ nA12 = vmlaq_s32(nA12, nxval, nyval);
+ // Accumulate Iy^2
+ nA22 = vmlaq_s32(nA22, nyval, nyval);
+ }
+
+ // Calculate the leftover elements
+ for(; kx < _window_dimension; ++kx)
+ {
+ const int32_t ixval = get_pixel<int16_t>(_old_scharr_gx, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky,
+ iw00, iw01, iw10, iw11, W_BITS);
+ const int32_t iyval = get_pixel<int16_t>(_old_scharr_gy, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky,
+ iw00, iw01, iw10, iw11, W_BITS);
+
+ iA11 += ixval * ixval;
+ iA12 += ixval * iyval;
+ iA22 += iyval * iyval;
+
+ bilinear_ix[kx + ky * _window_dimension] = ixval;
+ bilinear_iy[kx + ky * _window_dimension] = iyval;
+ }
+ }
+
+ iA11 += vgetq_lane_s32(nA11, 0) + vgetq_lane_s32(nA11, 1) + vgetq_lane_s32(nA11, 2) + vgetq_lane_s32(nA11, 3);
+ iA12 += vgetq_lane_s32(nA12, 0) + vgetq_lane_s32(nA12, 1) + vgetq_lane_s32(nA12, 2) + vgetq_lane_s32(nA12, 3);
+ iA22 += vgetq_lane_s32(nA22, 0) + vgetq_lane_s32(nA22, 1) + vgetq_lane_s32(nA22, 2) + vgetq_lane_s32(nA22, 3);
+
+ return std::make_tuple(iA11, iA12, iA22);
+}
+
+std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy)
+{
+ int ib1 = 0;
+ int ib2 = 0;
+
+ int32x4_t nb1 = vdupq_n_s32(0);
+ int32x4_t nb2 = vdupq_n_s32(0);
+
+ // Compute weights for the old keypoint
+ float old_keypoint_int_x = 0;
+ float old_keypoint_int_y = 0;
+
+ const float old_wx = std::modf(old_keypoint.x, &old_keypoint_int_x);
+ const float old_wy = std::modf(old_keypoint.y, &old_keypoint_int_y);
+
+ const int iw00_old = roundf((1.0f - old_wx) * (1.0f - old_wy) * D0);
+ const int iw01_old = roundf(old_wx * (1.0f - old_wy) * D0);
+ const int iw10_old = roundf((1.0f - old_wx) * old_wy * D0);
+ const int iw11_old = D0 - iw00_old - iw01_old - iw10_old;
+
+ const int16x4_t nw00_old = vdup_n_s16(iw00_old);
+ const int16x4_t nw01_old = vdup_n_s16(iw01_old);
+ const int16x4_t nw10_old = vdup_n_s16(iw10_old);
+ const int16x4_t nw11_old = vdup_n_s16(iw11_old);
+
+ // Compute weights for the new keypoint
+ float new_keypoint_int_x = 0;
+ float new_keypoint_int_y = 0;
+
+ const float new_wx = std::modf(new_keypoint.x, &new_keypoint_int_x);
+ const float new_wy = std::modf(new_keypoint.y, &new_keypoint_int_y);
+
+ const int iw00_new = roundf((1.0f - new_wx) * (1.0f - new_wy) * D0);
+ const int iw01_new = roundf(new_wx * (1.0f - new_wy) * D0);
+ const int iw10_new = roundf((1.0f - new_wx) * new_wy * D0);
+ const int iw11_new = D0 - iw00_new - iw01_new - iw10_new;
+
+ const int16x4_t nw00_new = vdup_n_s16(iw00_new);
+ const int16x4_t nw01_new = vdup_n_s16(iw01_new);
+ const int16x4_t nw10_new = vdup_n_s16(iw10_new);
+ const int16x4_t nw11_new = vdup_n_s16(iw11_new);
+
+ const int row_stride = _input_new->info()->strides_in_bytes()[1];
+ const Coordinates top_left_window_corner_old(static_cast<int>(old_keypoint_int_x) - _window_dimension / 2, static_cast<int>(old_keypoint_int_y) - _window_dimension / 2);
+ const Coordinates top_left_window_corner_new(static_cast<int>(new_keypoint_int_x) - _window_dimension / 2, static_cast<int>(new_keypoint_int_y) - _window_dimension / 2);
+ const uint8_t *old_ptr = _input_old->buffer() + _input_old->info()->offset_element_in_bytes(top_left_window_corner_old);
+ const uint8_t *new_ptr = _input_new->buffer() + _input_new->info()->offset_element_in_bytes(top_left_window_corner_new);
+ static const int32x4_t nshifter_tensor = vdupq_n_s32(-(W_BITS - 5));
+
+ for(int ky = 0; ky < _window_dimension; ++ky, new_ptr += row_stride, old_ptr += row_stride)
+ {
+ int kx = 0;
+
+ // Calculate elements in blocks of four as long as possible
+ for(; kx <= _window_dimension - 4; kx += 4)
+ {
+ // Interpolation old tensor
+ const int16x8_t nold_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx)));
+ const int16x8_t nold_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx + row_stride)));
+
+ const int32x4_t noldval = compute_bilinear_interpolation(nold_row1, nold_row2, nw00_old, nw01_old, nw10_old, nw11_old, nshifter_tensor);
+
+ // Interpolation new tensor
+ const int16x8_t nnew_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx)));
+ const int16x8_t nnew_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx + row_stride)));
+
+ const int32x4_t nnewval = compute_bilinear_interpolation(nnew_row1, nnew_row2, nw00_new, nw01_new, nw10_new, nw11_new, nshifter_tensor);
+
+ // Calculate It gradient, i.e. pixelwise difference between old and new tensor
+ const int32x4_t diff = vsubq_s32(nnewval, noldval);
+
+ // Load the Ix and Iy gradient computed in the previous stage
+ const int32x4_t nxval = vld1q_s32(bilinear_ix + kx + ky * _window_dimension);
+ const int32x4_t nyval = vld1q_s32(bilinear_iy + kx + ky * _window_dimension);
+
+ // Calculate Ix * It and Iy * It, and accumulate the results
+ nb1 = vmlaq_s32(nb1, diff, nxval);
+ nb2 = vmlaq_s32(nb2, diff, nyval);
+ }
+
+ // Calculate the leftover elements
+ for(; kx < _window_dimension; ++kx)
+ {
+ const int32_t ival = get_pixel<uint8_t>(_input_old, top_left_window_corner_old.x() + kx, top_left_window_corner_old.y() + ky,
+ iw00_old, iw01_old, iw10_old, iw11_old, W_BITS - 5);
+ const int32_t jval = get_pixel<uint8_t>(_input_new, top_left_window_corner_new.x() + kx, top_left_window_corner_new.y() + ky,
+ iw00_new, iw01_new, iw10_new, iw11_new, W_BITS - 5);
+
+ const int32_t diff = jval - ival;
+
+ ib1 += diff * bilinear_ix[kx + ky * _window_dimension];
+ ib2 += diff * bilinear_iy[kx + ky * _window_dimension];
+ }
+ }
+
+ ib1 += vgetq_lane_s32(nb1, 0) + vgetq_lane_s32(nb1, 1) + vgetq_lane_s32(nb1, 2) + vgetq_lane_s32(nb1, 3);
+ ib2 += vgetq_lane_s32(nb2, 0) + vgetq_lane_s32(nb2, 1) + vgetq_lane_s32(nb2, 2) + vgetq_lane_s32(nb2, 3);
+
+ return std::make_pair(ib1, ib2);
+}
+
+NELKTrackerKernel::NELKTrackerKernel()
+ : _input_old(nullptr), _input_new(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _old_points_internal(),
+ _new_points_internal(), _termination(Termination::TERM_CRITERIA_EPSILON), _use_initial_estimate(false), _pyramid_scale(0.0f), _epsilon(0.0f), _num_iterations(0), _window_dimension(0), _level(0),
+ _num_levels(0), _valid_region()
+{
+}
+
+BorderSize NELKTrackerKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NELKTrackerKernel::configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+ const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+ INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+ Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ size_t level, size_t num_levels, float pyramid_scale)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_old, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_new, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+
+ _input_old = input_old;
+ _input_new = input_new;
+ _old_scharr_gx = old_scharr_gx;
+ _old_scharr_gy = old_scharr_gy;
+ _old_points = old_points;
+ _new_points_estimates = new_points_estimates;
+ _new_points = new_points;
+ _old_points_internal = old_points_internal;
+ _new_points_internal = new_points_internal;
+ _termination = termination;
+ _use_initial_estimate = use_initial_estimate;
+ _epsilon = epsilon;
+ _num_iterations = num_iterations;
+ _window_dimension = window_dimension;
+ _level = level;
+ _num_levels = num_levels;
+ _pyramid_scale = pyramid_scale;
+
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, old_points->num_values()));
+ window.set(Window::DimY, Window::Dimension(0, 1));
+
+ _valid_region = intersect_valid_regions(
+ input_old->info()->valid_region(),
+ input_new->info()->valid_region(),
+ old_scharr_gx->info()->valid_region(),
+ old_scharr_gy->info()->valid_region());
+
+ update_window_and_padding(window,
+ AccessWindowStatic(input_old->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(input_new->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gx->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gy->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)));
+
+ INEKernel::configure(window);
+}
+
+void NELKTrackerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(_input_old->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_input_new->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_old_scharr_gx->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_old_scharr_gy->buffer() == nullptr);
+
+ const int list_end = window.x().end();
+ const int list_start = window.x().start();
+
+ init_keypoints(list_start, list_end);
+
+ const int buffer_size = _window_dimension * _window_dimension;
+ int bilinear_ix[buffer_size];
+ int bilinear_iy[buffer_size];
+
+ const int half_window = _window_dimension / 2;
+
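+ // A keypoint cannot be tracked if its window (half_window on each side, plus one pixel for the bilinear reads) does not fit inside the valid region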
+ auto is_invalid_keypoint = [&](const NELKInternalKeypoint & keypoint)
+ {
+ const int x = std::floor(keypoint.x);
+ const int y = std::floor(keypoint.y);
+
+ return (x - half_window < _valid_region.start(0)) || (x + half_window >= _valid_region.end(0) - 1) || (y - half_window < _valid_region.start(1)) || (y + half_window >= _valid_region.end(1) - 1);
+ };
+
+ for(int list_indx = list_start; list_indx < list_end; ++list_indx)
+ {
+ NELKInternalKeypoint &old_keypoint = _old_points_internal->at(list_indx);
+ NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx);
+
+ if(!old_keypoint.tracking_status)
+ {
+ continue;
+ }
+
+ if(is_invalid_keypoint(old_keypoint))
+ {
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ continue;
+ }
+
+ // Compute spatial gradient matrix
+ int iA11 = 0;
+ int iA12 = 0;
+ int iA22 = 0;
+
+ std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix, bilinear_iy);
+
+ const float A11 = iA11 * FLT_SCALE;
+ const float A12 = iA12 * FLT_SCALE;
+ const float A22 = iA22 * FLT_SCALE;
+
+ // Calculate minimum eigenvalue
+ const float sum_A11_A22 = A11 + A22;
+ const float discriminant = sum_A11_A22 * sum_A11_A22 - 4.0f * (A11 * A22 - A12 * A12);
+ // Divide by _window_dimension^2 to reduce the floating point accumulation error
+ const float minimum_eigenvalue = (sum_A11_A22 - std::sqrt(discriminant)) / (2.0f * _window_dimension * _window_dimension);
+
+ // Determinant
+ const double D = A11 * A22 - A12 * A12;
+
+ // Check if it is a good point to track
+ if(minimum_eigenvalue < EIGENVALUE_THRESHOLD || D < DETERMINANT_THRESHOLD)
+ {
+ // Invalidate tracked point
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ continue;
+ }
+
+ float prev_delta_x = 0.0f;
+ float prev_delta_y = 0.0f;
+
+ for(unsigned int j = 0; j < _num_iterations || _termination == Termination::TERM_CRITERIA_EPSILON; ++j)
+ {
+ if(is_invalid_keypoint(new_keypoint))
+ {
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ break;
+ }
+
+ // Compute image mismatch vector
+ int ib1 = 0;
+ int ib2 = 0;
+
+ std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix, bilinear_iy);
+
+ double b1 = ib1 * FLT_SCALE;
+ double b2 = ib2 * FLT_SCALE;
+
+ // Compute motion vector -> A^-1 * -b
+ const float delta_x = (A12 * b2 - A22 * b1) / D;
+ const float delta_y = (A12 * b1 - A11 * b2) / D;
+
+ // Update the new position
+ new_keypoint.x += delta_x;
+ new_keypoint.y += delta_y;
+
+ const float mag2 = delta_x * delta_x + delta_y * delta_y;
+
+ // Check if the termination criterion is EPSILON and whether it is satisfied
+ if(mag2 <= _epsilon && (_termination == Termination::TERM_CRITERIA_EPSILON || _termination == Termination::TERM_CRITERIA_BOTH))
+ {
+ break;
+ }
+
+ // Check convergence by analyzing the previous delta
+ if(j > 0 && std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f)
+ {
+ new_keypoint.x -= delta_x * _pyramid_scale;
+ new_keypoint.y -= delta_y * _pyramid_scale;
+ break;
+ }
+
+ prev_delta_x = delta_x;
+ prev_delta_y = delta_y;
+ }
+ }
+
+ if(_level == 0)
+ {
+ for(int list_indx = list_start; list_indx < list_end; ++list_indx)
+ {
+ const NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx);
+
+ _new_points->at(list_indx).x = roundf(new_keypoint.x);
+ _new_points->at(list_indx).y = roundf(new_keypoint.y);
+ _new_points->at(list_indx).tracking_status = new_keypoint.tracking_status ? 1 : 0;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..ab84efbf23
--- /dev/null
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * window.thread_id();
+ const int window_step_x = 16 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ const float *vec_a_end_addr = vec_a + num_elems_vec_a;
+
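+ // Main loop: process four elements of vec_a per iteration (two pairs of rows of matrix B), accumulating into 16 output columns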
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ },
+ ina, out);
+}
+} // namespace
+
+NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration_x = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ vector_matrix_multiply_f32(_input0, _input1, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..a874d219d7
--- /dev/null
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+// Defines for computing atan2
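+// SCALE_FACTOR = 256 / 360: it maps an angle in degrees onto the 8-bit [0, 255] output range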
+constexpr float SCALE_FACTOR = 0.7111111111111111f;
+constexpr float PI = 3.141592653589793f;
+constexpr float SCALE_180 = 180.0f / PI;
+constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
+constexpr float PI_4 = 0.7853981633974483f;
+constexpr float COEFF1 = 0.0663f;
+constexpr float COEFF2 = 0.2447f;
+} // namespace
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline float16x8_t inv(float16x8_t x)
+{
+ const float16x8_t estimate = vrecpeq_f16(x);
+ return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
+}
+
+inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
+{
+ static const float16x8_t one = vdupq_n_f16(1.0f);
+ static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
+ static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
+ static const float16x8_t piover4 = vdupq_n_f16(PI_4);
+ static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
+ static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
+
+ const float16x8_t abs_gx = vabsq_f16(gx);
+ const float16x8_t abs_gy = vabsq_f16(gy);
+ const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
+ const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
+
+ // z = min(x, y) / max(x, y)
+ const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
+ const float16x8_t absz = vabsq_f16(z);
+
+ // arctan(z) = z * [pi/4 + (1 - |z|) * (0.2447 + 0.0663 * |z|)]
+ float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
+ vsubq_f16(one, absz),
+ vfmaq_f16(coeff2, coeff1, absz)));
+
+ // Convert radians to degrees, applying a scale factor so that the result lies in [0, 255]
+ arctan = vmulq_f16(arctan, scale);
+
+ /* If |gx| < |gy| the min/max ratio was inverted, so the result is 90 - arctan */
+ return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
+}
+
+inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
+{
+ static const float16x8_t scale = vdupq_n_f16(SCALE_360);
+ static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+ static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
+
+ float16x8_t arctan = atan2_fast(gx, gy, scale);
+
+ // Choose correct quadrant
+ arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
+ arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
+
+ return arctan;
+}
+
+inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
+{
+ static const float16x8_t scale = vdupq_n_f16(SCALE_180);
+ static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
+ static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ float16x8_t arctan = atan2_fast(gx, gy, scale);
+
+ // Choose correct quadrant
+ arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
+ arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
+ arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
+
+ return arctan;
+}
+
+inline float32x4_t invsqrtv(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
+inline float32x4_t sqrtv(float32x4_t x)
+{
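+ // sqrt(x) is computed as x * (1 / sqrt(x)); the 0.5 acts as a rounding bias for the later float-to-integer conversion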
+ float32x4_t res = vdupq_n_f32(0.5f);
+ return vmlaq_f32(res, x, invsqrtv(x));
+}
+
+inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
+{
+ return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2));
+}
+
+inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
+{
+ const int32x4x2_t square_x =
+ {
+ vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
+ vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
+ };
+
+ const int32x4x2_t square_y =
+ {
+ vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
+ vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
+ };
+
+ const uint32x4x2_t sum =
+ {
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
+ vreinterpretq_u32_s32(square_y.val[0])),
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
+ vreinterpretq_u32_s32(square_y.val[1]))
+ };
+
+ const float32x4x2_t res =
+ {
+ sqrtv(vcvtq_f32_u32(sum.val[0])),
+ sqrtv(vcvtq_f32_u32(sum.val[1]))
+ };
+
+ return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(res.val[1])));
+}
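+// magnitude_l1() returns |gx| + |gy| with saturating S16 arithmetic, while
+// magnitude_l2() widens to 32 bit, forms gx * gx + gy * gy, takes the square
+// root in float and narrows back to S16 with saturation. A roughly equivalent
+// scalar sketch (illustrative only, ignoring the estimate-based square root):
+//   int16_t mag_l2(int16_t gx, int16_t gy)
+//   {
+//       const uint32_t sum = uint32_t(int32_t(gx) * gx) + uint32_t(int32_t(gy) * gy);
+//       return int16_t(std::min<long>(std::lround(std::sqrt(double(sum))), INT16_MAX));
+//   }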
+
+inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
+{
+ static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
+
+ const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
+ const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
+
+ // Compute fast atan2
+ const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
+
+ return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
+}
+
+inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
+{
+ static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
+
+ const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
+ const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
+
+ // Compute fast atan2
+ const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
+
+ return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
+}
+
+template <MagnitudeType mag_type>
+inline int16x8x2_t compute_magnitude(const int16x8x2_t &gx, const int16x8x2_t &gy);
+
+template <>
+inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &gx, const int16x8x2_t &gy)
+{
+    const int16x8x2_t mag =
+    {
+        magnitude_l2(gx.val[0], gy.val[0]),
+        magnitude_l2(gx.val[1], gy.val[1])
+    };
+
+    return mag;
+}
+
+template <>
+inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &gx, const int16x8x2_t &gy)
+{
+    const int16x8x2_t mag =
+    {
+        magnitude_l1(gx.val[0], gy.val[0]),
+        magnitude_l1(gx.val[1], gy.val[1])
+    };
+
+    return mag;
+}
+
+template <PhaseType phase_type>
+inline uint8x16_t compute_phase(const int16x8x2_t &gx, const int16x8x2_t &gy);
+
+template <>
+inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &gx, const int16x8x2_t &gy)
+{
+    return vcombine_u8(phase_signed(gx.val[0], gy.val[0]),
+                       phase_signed(gx.val[1], gy.val[1]));
+}
+
+template <>
+inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &gx, const int16x8x2_t &gy)
+{
+    return vcombine_u8(phase_unsigned(gx.val[0], gy.val[0]),
+                       phase_unsigned(gx.val[1], gy.val[1]));
+}
+} // namespace fp16
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
+ ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
+
+ const bool run_mag = magnitude != nullptr;
+ const bool run_phase = phase != nullptr;
+
+ if(run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
+ }
+
+ if(run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(run_mag && run_phase)
+ {
+ /* Run magnitude and phase */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
+ }
+ else if(run_mag)
+ {
+ /* Run magnitude */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
+ }
+ else if(run_phase)
+ {
+ /* Run phase */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
+ magnitude_access,
+ phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+
+ magnitude_access.set_valid_region(win, valid_region);
+ phase_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ };
+
+ const int16x8x2_t input2 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ };
+
+        // Compute magnitude
+ const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+ },
+ gx, gy, magnitude);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ };
+
+ const int16x8x2_t input2 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ };
+
+ // Compute and store phase
+ vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
+ },
+ gx, gy, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ };
+
+ const int16x8x2_t input2 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ };
+
+ // Compute and store magnitude
+ const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+
+ // Compute and store phase
+ vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
+ },
+ gx, gy, magnitude, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
+inline float32x4_t inv(float32x4_t x)
+{
+ float32x4_t result = vrecpeq_f32(x);
+ result = vmulq_f32(vrecpsq_f32(x, result), result);
+ return result;
+}
+
+inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
+{
+ const float32x4_t zero = vdupq_n_f32(0.0f);
+ const float32x4_t epsilon = vdupq_n_f32(1e-9f);
+ const float32x4_t piover4 = vdupq_n_f32(PI_4);
+ const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
+ const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
+ const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
+ const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
+ const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
+ const float32x4_t scale = vdupq_n_f32(SCALE_360);
+
+ float32x4_t abs_gx = vabsq_f32(gx);
+ float32x4_t abs_gy = vabsq_f32(gy);
+ float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
+ float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
+ float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
+ float32x4_t absz = vabsq_f32(z);
+ float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
+
+    /* Compute y = pi/4 * z + z * (1 - |z|) * (0.2447 + 0.0663 * |z|) */
+ float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
+ result = vmulq_f32(result, term);
+ result = vmlaq_f32(result, piover4, z);
+
+    /* Radians to degrees conversion with a scale factor applied so that the result lies in [0, 255] */
+ result = vmulq_f32(result, scale);
+
+    /* If |gy| > |gx| the ratio was inverted, so the angle is 90 - result */
+ result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
+
+ /* Choose correct quadrant */
+ result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
+ result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
+
+ return result;
+}
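+// The last two selects place the first-quadrant angle theta computed above in
+// the correct quadrant:
+//   gx >= 0, gy >= 0 : theta
+//   gx <  0          : 180 - theta
+//   gy <  0          : 360 - theta   (applied after the gx correction,
+//                                     so gx < 0 and gy < 0 yields 180 + theta)
+// with the constants scaled in the same way as the result itself.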
+
+inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
+{
+ const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t epsilon    = vdupq_n_f32(1e-9f); // epsilon used to avoid division by zero
+ const float32x4_t piover4 = vdupq_n_f32(PI_4);
+ const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
+ const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
+ const float32x4_t ninety = vdupq_n_f32(90.0f);
+ const float32x4_t oneeighty = vdupq_n_f32(180.0f);
+ const float32x4_t threesixty = vdupq_n_f32(360.0f);
+ const float32x4_t scale = vdupq_n_f32(SCALE_180);
+
+ float32x4_t abs_gx = vabsq_f32(gx);
+ float32x4_t abs_gy = vabsq_f32(gy);
+ float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
+ float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
+ float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
+ float32x4_t absz = vabsq_f32(z);
+
+    /* Compute y = pi/4 * z + z * (1 - |z|) * (0.2447 + 0.0663 * |z|) */
+ float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
+ float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
+ result = vmulq_f32(result, term);
+ result = vmlaq_f32(result, piover4, z);
+
+ /* Radians to degrees conversion */
+ result = vmulq_f32(result, scale);
+
+    /* If |gy| > |gx| the ratio was inverted, so the angle is 90 - result */
+ result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
+
+ /* Choose correct quadrant */
+ result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
+ result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
+ result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
+
+ return result;
+}
+
+inline float32x4_t invsqrtv(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
+inline float32x4_t sqrtv(float32x4_t x)
+{
+ float32x4_t res = vdupq_n_f32(0.5f);
+ return vmlaq_f32(res, x, invsqrtv(x));
+}
+
+inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
+{
+ const int32x4x2_t square_x =
+ {
+ {
+ vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
+ vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
+ }
+ };
+
+ const int32x4x2_t square_y =
+ {
+ {
+ vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
+ vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
+ }
+ };
+
+ const uint32x4x2_t sum =
+ {
+ {
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
+ }
+ };
+
+ const float32x4x2_t res =
+ {
+ {
+ sqrtv(vcvtq_f32_u32(sum.val[0])),
+ sqrtv(vcvtq_f32_u32(sum.val[1]))
+ }
+ };
+
+ return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(res.val[1])));
+}
+
+inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
+{
+ int16x8_t gx_abs = vabsq_s16(input1);
+ int16x8_t gy_abs = vabsq_s16(input2);
+
+ /* Saturating add */
+ return vqaddq_s16(gx_abs, gy_abs);
+}
+
+inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
+{
+ const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
+
+ float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
+ float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
+ float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
+ float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
+
+ /* Compute fast atan2 */
+ float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
+ float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
+
+ angle_high = vaddq_f32(angle_high, zeropointfive);
+ angle_low = vaddq_f32(angle_low, zeropointfive);
+
+ return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
+ vqmovun_s32(vcvtq_s32_f32(angle_high))));
+}
+
+inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
+{
+ const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
+
+ float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
+ float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
+ float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
+ float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
+
+ /* Compute fast atan2 */
+ float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
+ float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
+
+ angle_high = vaddq_f32(angle_high, zeropointfive);
+ angle_low = vaddq_f32(angle_low, zeropointfive);
+
+ return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
+ vqmovun_s32(vcvtq_s32_f32(angle_high))));
+}
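+// phase_signed() converts the full [0, 360) angle (scaled by SCALE_360) into the
+// U8 output range, while phase_unsigned() folds the angle into [0, 180) first
+// (scaled by SCALE_180); the 0.5 added before the float-to-integer conversion
+// rounds to the nearest representable value.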
+} // namespace
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
+
+ const bool run_mag = magnitude != nullptr;
+ const bool run_phase = phase != nullptr;
+
+ if(run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
+ }
+
+ if(run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(run_mag && run_phase)
+ {
+ /* Run magnitude and phase */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
+ }
+ else
+ {
+ if(run_mag)
+ {
+ /* Run magnitude */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
+ }
+ else if(run_phase)
+ {
+ /* Run phase */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
+ }
+ else
+ {
+            ARM_COMPUTE_ERROR("At least one output must be non-NULL");
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
+ magnitude_access,
+ phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+
+ magnitude_access.set_valid_region(win, valid_region);
+ phase_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
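+// configure() processes 16 pixels per iteration and restricts the valid region
+// of both outputs to the intersection of the gx and gy valid regions, since a
+// magnitude or phase value is only meaningful where both gradients are valid.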
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute magnitude */
+ int16x8x2_t mag{ {} };
+
+ if(MagnitudeType::L2NORM == mag_type)
+ {
+ mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
+ }
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+ },
+ gx, gy, magnitude);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute phase */
+ uint8x8x2_t vphase{ {} };
+
+ if(PhaseType::SIGNED == phase_type)
+ {
+ vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
+ }
+
+ /* Store phase */
+ vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
+ },
+ gx, gy, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute magnitude */
+ int16x8x2_t mag{ {} };
+
+ if(MagnitudeType::L2NORM == mag_type)
+ {
+ mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
+ }
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+
+ /* Compute phase */
+ uint8x8x2_t vphase{ {} };
+
+ if(PhaseType::SIGNED == phase_type)
+ {
+ vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
+ }
+
+ /* Store phase */
+ vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
+ },
+ gx, gy, magnitude, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..4616203d66
--- /dev/null
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <tuple>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+template <bool calc_sum_squared>
+std::pair<uint64x1_t, uint64x1_t> accumulate(const Window &window, Iterator &iterator)
+{
+ uint64x1_t sum = vdup_n_u64(0);
+ uint64x1_t sum_squared = vdup_n_u64(0);
+
+ // Calculate sum
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t in_data = vld1q_u8(iterator.ptr());
+
+ // Sum of the low and high elements of data
+ const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data));
+ const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0));
+ const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+
+ // Update sum
+ sum = vpadal_u32(sum, tmp2);
+
+ if(calc_sum_squared)
+ {
+ const uint16x8_t square_data_low = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data));
+ const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data));
+
+ // Sum of the low and high elements of data
+ const uint32x4_t tmp0_low = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low));
+ const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high));
+ const uint32x4_t tmp1 = vaddq_u32(tmp0_low, tmp0_high);
+ const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+
+ // Update sum
+ sum_squared = vpadal_u32(sum_squared, tmp2);
+ }
+ },
+ iterator);
+
+ return std::make_pair(sum, sum_squared);
+}
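+// accumulate() widens each batch of 16 U8 pixels stepwise (u8 -> u16 -> u32) and
+// folds it into a 64-bit running sum with vpadal_u32; when calc_sum_squared is
+// true the same widening reduction is applied to the per-pixel squares produced
+// by vmull_u8. Scalar sketch of one iteration (illustrative only):
+//   for(int i = 0; i < 16; ++i) { sum += p[i]; if(calc_sum_squared) { sum_squared += p[i] * p[i]; } }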
+} // namespace
+
+NEMeanStdDevKernel::NEMeanStdDevKernel()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx()
+{
+}
+
+void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == mean);
+ ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+ ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ _input = input;
+ _mean = mean;
+ _stddev = stddev;
+ _global_sum = global_sum;
+ _global_sum_squared = global_sum_squared;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMeanStdDevKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+
+ uint64x1_t local_sum = vdup_n_u64(0);
+ uint64x1_t local_sum_squared = vdup_n_u64(0);
+
+ if(_stddev != nullptr)
+ {
+ std::tie(local_sum, local_sum_squared) = accumulate<true>(window, input);
+ }
+ else
+ {
+ std::tie(local_sum, local_sum_squared) = accumulate<false>(window, input);
+ }
+
+ const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
+
+ // Merge sum and calculate mean and stddev
+ std::unique_lock<std::mutex> lock(_mtx);
+
+ *_global_sum += vget_lane_u64(local_sum, 0);
+
+ const float mean = *_global_sum / num_pixels;
+ *_mean = mean;
+
+ if(_stddev != nullptr)
+ {
+ const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0);
+ *_global_sum_squared += tmp_sum_squared;
+ *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean));
+ }
+
+ lock.unlock();
+}
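+// run() merges each thread's partial sums into the shared accumulators under
+// _mtx and derives the statistics from the usual identities
+//   mean   = sum / N
+//   stddev = sqrt(sum_squared / N - mean * mean)
+// where N is the total number of pixels of the input image.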
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..601a0e109f
--- /dev/null
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void sort(uint8x8_t &a, uint8x8_t &b)
+{
+ const uint8x8_t min = vmin_u8(a, b);
+ const uint8x8_t max = vmax_u8(a, b);
+ a = min;
+ b = max;
+}
+} // namespace
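+// sort() is the compare-exchange primitive of the sorting network used below:
+// after the call, a holds the lane-wise minimum and b the lane-wise maximum, so
+// the same exchange sequence sorts eight independent pixel neighbourhoods at once.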
+
+BorderSize NEMedian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEMedian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+ constexpr int rect_offset_xy = -1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEMedian3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+    const unsigned char *input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+    const unsigned char *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+    const unsigned char *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = vget_low_u8(top_data);
+ uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
+ uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
+ uint8x8_t p3 = vget_low_u8(mid_data);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p6 = vget_low_u8(bot_data);
+ uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
+ uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
+
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+
+ sort(p0, p1);
+ sort(p3, p4);
+ sort(p6, p7);
+
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+
+ sort(p0, p3);
+ sort(p5, p8);
+ sort(p4, p7);
+
+ sort(p3, p6);
+ sort(p1, p4);
+ sort(p2, p5);
+
+ sort(p4, p7);
+ sort(p4, p2);
+ sort(p6, p4);
+
+ sort(p4, p2);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
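+// The 19 compare-exchange steps above form the classic median-of-9 selection
+// network (only exchanges that can influence the median are kept), leaving the
+// median of each 3x3 neighbourhood in p4 for the eight pixels written per iteration.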
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..b188614752
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <climits>
+#include <cstddef>
+
+namespace arm_compute
+{
+NEMinMaxKernel::NEMinMaxKernel()
+ : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
+{
+}
+
+void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _input = input;
+ _min = min;
+ _max = max;
+
+ switch(input->info()->format())
+ {
+ case Format::U8:
+ _min_init = UCHAR_MAX;
+ _max_init = 0;
+ _func = &NEMinMaxKernel::minmax_U8;
+ break;
+ case Format::S16:
+ _min_init = SHRT_MAX;
+ _max_init = SHRT_MIN;
+ _func = &NEMinMaxKernel::minmax_S16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void NEMinMaxKernel::reset()
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ *_min = _min_init;
+ *_max = _max_init;
+}
+
+template <typename T>
+void NEMinMaxKernel::update_min_max(const T min, const T max)
+{
+ std::lock_guard<std::mutex> lock(_mtx);
+
+ if(min < *_min)
+ {
+ *_min = min;
+ }
+
+ if(max > *_max)
+ {
+ *_max = max;
+ }
+}
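+// update_min_max() merges a thread's local extrema into the shared *_min / *_max
+// values under _mtx, so the kernel window can be split across threads and each
+// thread reduces its own sub-window before the final merge.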
+
+void NEMinMaxKernel::minmax_U8(const Window &win)
+{
+ uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
+ uint8x8_t carry_max = vdup_n_u8(0);
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t pixels = vld1q_u8(input.ptr());
+ const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ carry_min = vmin_u8(tmp_min, carry_min);
+ carry_max = vmax_u8(tmp_max, carry_max);
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+
+ // Extract max/min values
+ const uint8_t min_i = vget_lane_u8(carry_min, 0);
+ const uint8_t max_i = vget_lane_u8(carry_max, 0);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
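+// Each vpmin_u8 / vpmax_u8 round halves the number of candidate lanes
+// (8 -> 4 -> 2 -> 1), so after three rounds lane 0 holds the minimum / maximum
+// of the whole carry vector; the S16 variant below needs only two rounds.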
+
+void NEMinMaxKernel::minmax_S16(const Window &win)
+{
+ int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
+ int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const int16x8x2_t pixels = vld2q_s16(in_ptr);
+ const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
+ const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
+ const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
+ const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
+ carry_min = vmin_s16(tmp_min2, carry_min);
+ carry_max = vmax_s16(tmp_max2, carry_max);
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_s16(carry_min, carry_min);
+ carry_max = vpmax_s16(carry_max, carry_max);
+ carry_min = vpmin_s16(carry_min, carry_min);
+ carry_max = vpmax_s16(carry_max, carry_max);
+
+ // Extract max/min values
+ const int16_t min_i = vget_lane_s16(carry_min, 0);
+ const int16_t max_i = vget_lane_s16(carry_max, 0);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
+
+NEMinMaxLocationKernel::NEMinMaxLocationKernel()
+ : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr), _num_elems_processed_per_iteration(0)
+{
+}
+
+bool NEMinMaxLocationKernel::is_parallelisable() const
+{
+ return false;
+}
+
+template <unsigned int...>
+struct index_seq
+{
+ index_seq() = default;
+ index_seq(const index_seq &) = default;
+ index_seq &operator=(const index_seq &) = default;
+ index_seq(index_seq &&) noexcept = default;
+ index_seq &operator=(index_seq &&) noexcept = default;
+ virtual ~index_seq() = default;
+};
+template <unsigned int N, unsigned int... S>
+struct gen_index_seq : gen_index_seq < N - 1, N - 1, S... >
+{
+};
+template <unsigned int... S>
+struct gen_index_seq<0u, S...> : index_seq<S...>
+{
+ using type = index_seq<S...>;
+};
+
+template <class T, unsigned int... N>
+struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
+{
+ static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
+};
+
+template <class T, unsigned int... N>
+const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>::func_table[sizeof...(N)] =
+{
+ &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
+};
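+// The table is generated from gen_index_seq<16>: entry N instantiates
+// minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>, i.e. the
+// four optional outputs (count_min, count_max, loc_min, loc_max) are encoded as
+// a 4-bit index. configure() builds the same index as
+//   (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max
+// to pick the specialisation that performs exactly the requested work.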
+
+void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
+ ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
+ uint32_t *min_count, uint32_t *max_count)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _input = input;
+ _min = min;
+ _max = max;
+ _min_count = min_count;
+ _max_count = max_count;
+ _min_loc = min_loc;
+ _max_loc = max_loc;
+
+ unsigned int count_min = (nullptr != min_count ? 1 : 0);
+ unsigned int count_max = (nullptr != max_count ? 1 : 0);
+ unsigned int loc_min = (nullptr != min_loc ? 1 : 0);
+ unsigned int loc_max = (nullptr != max_loc ? 1 : 0);
+
+ unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
+
+ switch(input->info()->format())
+ {
+ case Format::U8:
+ _func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
+ case Format::S16:
+ _func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ break;
+ }
+
+ _num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, _num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxLocationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+void NEMinMaxLocationKernel::minmax_loc(const Window &win)
+{
+ if(count_min || count_max || loc_min || loc_max)
+ {
+ Iterator input(_input, win);
+
+ size_t min_count = 0;
+ size_t max_count = 0;
+ unsigned int step = _num_elems_processed_per_iteration;
+
+ // Clear min location array
+ if(loc_min)
+ {
+ _min_loc->clear();
+ }
+
+ // Clear max location array
+ if(loc_max)
+ {
+ _max_loc->clear();
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ int32_t idx = id.x();
+ int32_t idy = id.y();
+
+ for(unsigned int i = 0; i < step; ++i)
+ {
+ const T pixel = *in_ptr++;
+ Coordinates2D p{ idx++, idy };
+
+ if(count_min || loc_min)
+ {
+ if(*_min == pixel)
+ {
+ if(count_min)
+ {
+ ++min_count;
+ }
+
+ if(loc_min)
+ {
+ _min_loc->push_back(p);
+ }
+ }
+ }
+
+ if(count_max || loc_max)
+ {
+ if(*_max == pixel)
+ {
+ if(count_max)
+ {
+ ++max_count;
+ }
+
+ if(loc_max)
+ {
+ _max_loc->push_back(p);
+ }
+ }
+ }
+ }
+ },
+ input);
+
+ if(count_min)
+ {
+ *_min_count = min_count;
+ }
+
+ if(count_max)
+ {
+ *_max_count = max_count;
+ }
+ }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..03d1409be1
--- /dev/null
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+const uint8x16_t zero_u8 = vdupq_n_u8(0);
+
+template <size_t columns>
+inline uint8x8_t min_row(uint8x16_t row_data)
+{
+ uint8x8_t min = vget_low_u8(row_data);
+
+ for(size_t c = 1; c < columns; ++c)
+ {
+ row_data = vextq_u8(row_data, zero_u8, 1);
+ min = vmin_u8(min, vget_low_u8(row_data));
+ }
+
+ return min;
+}
+
+template <size_t columns>
+inline uint8x8_t max_row(uint8x16_t row_data)
+{
+ uint8x8_t max = vget_low_u8(row_data);
+
+ for(size_t c = 1; c < columns; ++c)
+ {
+ row_data = vextq_u8(row_data, zero_u8, 1);
+ max = vmax_u8(max, vget_low_u8(row_data));
+ }
+
+ return max;
+}
+
+inline void sort(uint8x8_t &a, uint8x8_t &b)
+{
+ const uint8x8_t min = vmin_u8(a, b);
+ const uint8x8_t max = vmax_u8(a, b);
+ a = min;
+ b = max;
+}
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+// Calculations that do not affect the median were removed.
+inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
+{
+ sort(p0, p1);
+ sort(p2, p3);
+ sort(p0, p2);
+ sort(p1, p3);
+ sort(p1, p2);
+ sort(p0, p4);
+ sort(p1, p4);
+ sort(p2, p4);
+}
+
+inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
+ uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
+ uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
+{
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+ sort(p0, p1);
+ sort(p3, p4);
+ sort(p6, p7);
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+ sort(p0, p3);
+ sort(p5, p8);
+ sort(p4, p7);
+ sort(p3, p6);
+ sort(p1, p4);
+ sort(p2, p5);
+ sort(p4, p7);
+ sort(p4, p2);
+ sort(p6, p4);
+ sort(p4, p2);
+}
+
+inline void sort21(uint8x8_t p[21])
+{
+ sort(p[0], p[1]);
+ sort(p[2], p[3]);
+ sort(p[4], p[5]);
+ sort(p[6], p[7]);
+ sort(p[8], p[9]);
+ sort(p[10], p[11]);
+ sort(p[12], p[13]);
+ sort(p[14], p[15]);
+ sort(p[16], p[17]);
+ sort(p[18], p[19]);
+ sort(p[0], p[2]);
+ sort(p[1], p[3]);
+ sort(p[4], p[6]);
+ sort(p[5], p[7]);
+ sort(p[8], p[10]);
+ sort(p[9], p[11]);
+ sort(p[12], p[14]);
+ sort(p[13], p[15]);
+ sort(p[16], p[18]);
+ sort(p[17], p[19]);
+ sort(p[1], p[2]);
+ sort(p[5], p[6]);
+ sort(p[0], p[4]);
+ sort(p[3], p[7]);
+ sort(p[9], p[10]);
+ sort(p[13], p[14]);
+ sort(p[8], p[12]);
+ sort(p[11], p[15]);
+ sort(p[17], p[18]);
+ sort(p[16], p[20]);
+ sort(p[1], p[5]);
+ sort(p[2], p[6]);
+ sort(p[9], p[13]);
+ sort(p[10], p[14]);
+ sort(p[0], p[8]);
+ sort(p[7], p[15]);
+ sort(p[17], p[20]);
+ sort(p[1], p[4]);
+ sort(p[3], p[6]);
+ sort(p[9], p[12]);
+ sort(p[11], p[14]);
+ sort(p[18], p[20]);
+ sort(p[0], p[16]);
+ sort(p[2], p[4]);
+ sort(p[3], p[5]);
+ sort(p[10], p[12]);
+ sort(p[11], p[13]);
+ sort(p[1], p[9]);
+ sort(p[6], p[14]);
+ sort(p[19], p[20]);
+ sort(p[3], p[4]);
+ sort(p[11], p[12]);
+ sort(p[1], p[8]);
+ sort(p[2], p[10]);
+ sort(p[5], p[13]);
+ sort(p[7], p[14]);
+ sort(p[3], p[11]);
+ sort(p[2], p[8]);
+ sort(p[4], p[12]);
+ sort(p[7], p[13]);
+ sort(p[1], p[17]);
+ sort(p[3], p[10]);
+ sort(p[5], p[12]);
+ sort(p[1], p[16]);
+ sort(p[2], p[18]);
+ sort(p[3], p[9]);
+ sort(p[6], p[12]);
+ sort(p[2], p[16]);
+ sort(p[3], p[8]);
+ sort(p[7], p[12]);
+ sort(p[5], p[9]);
+ sort(p[6], p[10]);
+ sort(p[4], p[8]);
+ sort(p[7], p[11]);
+ sort(p[3], p[19]);
+ sort(p[5], p[8]);
+ sort(p[7], p[10]);
+ sort(p[3], p[18]);
+ sort(p[4], p[20]);
+ sort(p[6], p[8]);
+ sort(p[7], p[9]);
+ sort(p[3], p[17]);
+ sort(p[5], p[20]);
+ sort(p[7], p[8]);
+ sort(p[3], p[16]);
+ sort(p[6], p[20]);
+ sort(p[5], p[17]);
+ sort(p[7], p[20]);
+ sort(p[4], p[16]);
+ sort(p[6], p[18]);
+ sort(p[5], p[16]);
+ sort(p[7], p[19]);
+ sort(p[7], p[18]);
+ sort(p[6], p[16]);
+ sort(p[7], p[17]);
+ sort(p[10], p[18]);
+ sort(p[7], p[16]);
+ sort(p[9], p[17]);
+ sort(p[8], p[16]);
+ sort(p[9], p[16]);
+ sort(p[10], p[16]);
+}
+
+inline void sort25(uint8x8_t p[25])
+{
+ sort(p[1], p[2]);
+ sort(p[0], p[1]);
+ sort(p[1], p[2]);
+ sort(p[4], p[5]);
+ sort(p[3], p[4]);
+ sort(p[4], p[5]);
+ sort(p[0], p[3]);
+ sort(p[2], p[5]);
+ sort(p[2], p[3]);
+ sort(p[1], p[4]);
+ sort(p[1], p[2]);
+ sort(p[3], p[4]);
+ sort(p[7], p[8]);
+ sort(p[6], p[7]);
+ sort(p[7], p[8]);
+ sort(p[10], p[11]);
+ sort(p[9], p[10]);
+ sort(p[10], p[11]);
+ sort(p[6], p[9]);
+ sort(p[8], p[11]);
+ sort(p[8], p[9]);
+ sort(p[7], p[10]);
+ sort(p[7], p[8]);
+ sort(p[9], p[10]);
+ sort(p[0], p[6]);
+ sort(p[4], p[10]);
+ sort(p[4], p[6]);
+ sort(p[2], p[8]);
+ sort(p[2], p[4]);
+ sort(p[6], p[8]);
+ sort(p[1], p[7]);
+ sort(p[5], p[11]);
+ sort(p[5], p[7]);
+ sort(p[3], p[9]);
+ sort(p[3], p[5]);
+ sort(p[7], p[9]);
+ sort(p[1], p[2]);
+ sort(p[3], p[4]);
+ sort(p[5], p[6]);
+ sort(p[7], p[8]);
+ sort(p[9], p[10]);
+ sort(p[13], p[14]);
+ sort(p[12], p[13]);
+ sort(p[13], p[14]);
+ sort(p[16], p[17]);
+ sort(p[15], p[16]);
+ sort(p[16], p[17]);
+ sort(p[12], p[15]);
+ sort(p[14], p[17]);
+ sort(p[14], p[15]);
+ sort(p[13], p[16]);
+ sort(p[13], p[14]);
+ sort(p[15], p[16]);
+ sort(p[19], p[20]);
+ sort(p[18], p[19]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[21], p[23]);
+ sort(p[22], p[24]);
+ sort(p[22], p[23]);
+ sort(p[18], p[21]);
+ sort(p[20], p[23]);
+ sort(p[20], p[21]);
+ sort(p[19], p[22]);
+ sort(p[22], p[24]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[12], p[18]);
+ sort(p[16], p[22]);
+ sort(p[16], p[18]);
+ sort(p[14], p[20]);
+ sort(p[20], p[24]);
+ sort(p[14], p[16]);
+ sort(p[18], p[20]);
+ sort(p[22], p[24]);
+ sort(p[13], p[19]);
+ sort(p[17], p[23]);
+ sort(p[17], p[19]);
+ sort(p[15], p[21]);
+ sort(p[15], p[17]);
+ sort(p[19], p[21]);
+ sort(p[13], p[14]);
+ sort(p[15], p[16]);
+ sort(p[17], p[18]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[0], p[12]);
+ sort(p[8], p[20]);
+ sort(p[8], p[12]);
+ sort(p[4], p[16]);
+ sort(p[16], p[24]);
+ sort(p[12], p[16]);
+ sort(p[2], p[14]);
+ sort(p[10], p[22]);
+ sort(p[10], p[14]);
+ sort(p[6], p[18]);
+ sort(p[6], p[10]);
+ sort(p[10], p[12]);
+ sort(p[1], p[13]);
+ sort(p[9], p[21]);
+ sort(p[9], p[13]);
+ sort(p[5], p[17]);
+ sort(p[13], p[17]);
+ sort(p[3], p[15]);
+ sort(p[11], p[23]);
+ sort(p[11], p[15]);
+ sort(p[7], p[19]);
+ sort(p[7], p[11]);
+ sort(p[11], p[13]);
+ sort(p[11], p[12]);
+}
+} // namespace
+
+NENonLinearFilterKernel::NENonLinearFilterKernel()
+ : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
+{
+}
+
+BorderSize NENonLinearFilterKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
+ ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
+
+ // Set class variables
+ _border_size = BorderSize(mask_size / 2);
+ _input = input;
+ _output = output;
+ _mask = mask;
+ _pattern = pattern;
+ _function = function;
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
+ output_access);
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+
+ // Define function index
+ _func_idx = (3 == mask_size) ? 0 : 1;
+
+ if(MatrixPattern::OTHER != pattern)
+ {
+ _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(function);
+ }
+}
+
+void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
+{
+ unsigned int v = 0;
+
+ for(int r = 0; r < rows; ++r)
+ {
+ for(int c = 0; c < cols; ++c, ++v)
+ {
+ uint8_t val = 0;
+
+ switch(pattern)
+ {
+ case MatrixPattern::BOX:
+ val = 255;
+ break;
+ case MatrixPattern::CROSS:
+ val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
+ break;
+ case MatrixPattern::DISK:
+ val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
+ (cols / 2.0f))) <= 1.0f ? 255 : 0;
+ break;
+ default:
+ return;
+ }
+
+ mask[v] = val;
+ }
+ }
+}
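+// For the DISK pattern a mask element is set when its centre lies inside the
+// ellipse inscribed in the mask, i.e. when
+//   (r - rows/2 + 0.5)^2 / (rows/2)^2 + (c - cols/2 + 0.5)^2 / (cols/2)^2 <= 1.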
+
+template <>
+void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = vget_low_u8(top_data);
+ uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
+ uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
+ uint8x8_t p3 = vget_low_u8(mid_data);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p6 = vget_low_u8(bot_data);
+ uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
+ uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
+
+ sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
+template <>
+void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ const uint8x8_t d[] =
+ {
+ vget_low_u8(top2_data),
+ vget_high_u8(top2_data),
+ vget_low_u8(top_data),
+ vget_high_u8(top_data),
+ vget_low_u8(mid_data),
+ vget_high_u8(mid_data),
+ vget_low_u8(bot_data),
+ vget_high_u8(bot_data),
+ vget_low_u8(bot2_data),
+ vget_high_u8(bot2_data)
+ };
+
+ uint8x8_t p[25];
+ for(unsigned int i = 0; i < 5; ++i)
+ {
+ const unsigned int idx_d = i * 2;
+ const unsigned int idx_p = i * 5;
+
+ p[idx_p] = d[idx_d];
+ p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
+ p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
+ p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
+ p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
+ }
+
+ sort25(p);
+
+ vst1_u8(output.ptr(), p[12]);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::min_filter_box(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ // Get min of rows
+ uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
+
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ rows_min = vminq_u8(rows_min, data);
+ }
+
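+        // Get min of columns (horizontal reduction over mask_w adjacent lanes)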
+ const uint8x8_t out = min_row<mask_w>(rows_min);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::max_filter_box(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
+
+ // Get max of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ rows_max = vmaxq_u8(rows_max, data);
+ }
+
+ // Get max of columns
+ const uint8x8_t out = max_row<mask_w>(rows_max);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = top_data;
+ uint8x8_t p1 = vget_low_u8(mid_data);
+ uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p4 = bot_data;
+
+ sort5(p0, p1, p2, p3, p4);
+
+ vst1_u8(output.ptr(), p2);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
+ const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
+ const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset());
+
+ uint8x8_t p0 = top2_data;
+ uint8x8_t p1 = top_data;
+ uint8x8_t p2 = vget_low_u8(mid_data);
+ uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
+ uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
+ uint8x8_t p7 = bot_data;
+ uint8x8_t p8 = bot2_data;
+
+ sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::min_filter_cross(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
+
+ // Get min of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
+ rows_min = vmin_u8(rows_min, data);
+ }
+
+ // Get min of middle row
+ const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
+ uint8x8_t out = min_row<mask_w>(data);
+
+ // Get final min
+ out = vmin_u8(out, rows_min);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::max_filter_cross(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
+
+ // Set row pointers
+    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
+
+ // Get max of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
+ rows_max = vmax_u8(rows_max, data);
+ }
+
+ // Get max of middle row
+ const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
+ uint8x8_t out = max_row<mask_w>(data);
+
+ // Get final max
+ out = vmax_u8(out, rows_max);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+        const uint8x8_t d[] =
+ {
+ vget_low_u8(top2_data),
+ vget_high_u8(top2_data),
+ vget_low_u8(top_data),
+ vget_high_u8(top_data),
+ vget_low_u8(mid_data),
+ vget_high_u8(mid_data),
+ vget_low_u8(bot_data),
+ vget_high_u8(bot_data),
+ vget_low_u8(bot2_data),
+ vget_high_u8(bot2_data)
+ };
+
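+        // 5x5 disk layout: the outer rows (y = -2, +2) contribute 3 pixels each (x-1..x+1),
+        // the three middle rows contribute 5 each, giving 3 + 5 + 5 + 5 + 3 = 21 samples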
+ uint8x8_t p[21];
+ p[0] = d[0];
+ p[1] = vext_u8(d[0], d[1], 1);
+ p[2] = vext_u8(d[0], d[1], 2);
+ p[18] = d[8];
+ p[19] = vext_u8(d[8], d[9], 1);
+ p[20] = vext_u8(d[8], d[9], 2);
+
+ for(unsigned int i = 0; i < 3; ++i)
+ {
+ const unsigned int idx_d = 2 + i * 2;
+ const unsigned int idx_p = 3 + i * 5;
+
+ p[idx_p] = d[idx_d];
+ p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
+ p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
+ p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
+ p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
+ }
+
+ sort21(p);
+
+ vst1_u8(output.ptr(), p[10]);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
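+        // The 5x5 disk is handled as a 3-wide band on the outer rows plus a 5-wide band on the three middle rows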
+ const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
+ uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
+ rows_min_5 = vminq_u8(rows_min_5, mid_data);
+
+ const uint8x8_t out_3 = min_row<3>(rows_min_3);
+ const uint8x8_t out_5 = min_row<5>(rows_min_5);
+
+ vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
+ uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
+ rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
+
+ const uint8x8_t out_3 = max_row<3>(rows_max_3);
+ const uint8x8_t out_5 = max_row<5>(rows_max_5);
+
+ vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+ constexpr int mask_size = mask_w * mask_h;
+
+ // Set row pointers
+    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ std::array<uint8_t, mask_size> vals{ {} };
+
+ size_t v = 0;
+ size_t m = 0;
+
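+        // Gather the pixels selected by the user-supplied mask (255 = selected) into vals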
+ for(unsigned int r = 0; r < mask_h; ++r)
+ {
+ const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
+
+ for(unsigned int c = 0; c < mask_w; ++c, ++m)
+ {
+ if(_mask[m] == 255)
+ {
+ vals[v] = in_ptr[c];
+ ++v;
+ }
+ }
+ }
+
+        // Only do something if there is at least one non-zero element in the mask
+ if(v > 0)
+ {
+ std::sort(vals.begin(), vals.begin() + v);
+
+ switch(_function)
+ {
+ case NonLinearFilterFunction::MIN:
+ *output.ptr() = vals[0];
+ break;
+ case NonLinearFilterFunction::MAX:
+ *output.ptr() = vals[v - 1];
+ break;
+ case NonLinearFilterFunction::MEDIAN:
+ *output.ptr() = vals[v / 2];
+ break;
+ default:
+ break;
+ }
+ }
+ },
+ input, output);
+}
+
+void NENonLinearFilterKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window);
+
+ // Function table for BOX pattern
+ static const std::array<NonLinearFilterFunction, 6> func_table_box =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_box<3, 3>,
+ &NENonLinearFilterKernel::min_filter_box<3, 3>,
+ &NENonLinearFilterKernel::max_filter_box<3, 3>,
+ &NENonLinearFilterKernel::median_filter_box<5, 5>,
+ &NENonLinearFilterKernel::min_filter_box<5, 5>,
+ &NENonLinearFilterKernel::max_filter_box<5, 5>,
+ }
+ };
+
+ // Function table for CROSS pattern
+ static const std::array<NonLinearFilterFunction, 6> func_table_cross =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::min_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::max_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::median_filter_cross<5, 5>,
+ &NENonLinearFilterKernel::min_filter_cross<5, 5>,
+ &NENonLinearFilterKernel::max_filter_cross<5, 5>,
+ }
+ };
+
+ // Function table for DISK pattern
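+    // Note: a 3x3 disk covers the whole 3x3 neighbourhood, so the 3x3 entries reuse the box kernels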
+ static const std::array<NonLinearFilterFunction, 6> func_table_disk =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_box<3, 3>,
+ &NENonLinearFilterKernel::min_filter_box<3, 3>,
+ &NENonLinearFilterKernel::max_filter_box<3, 3>,
+ &NENonLinearFilterKernel::median_filter_disk<5, 5>,
+ &NENonLinearFilterKernel::min_filter_disk<5, 5>,
+ &NENonLinearFilterKernel::max_filter_disk<5, 5>,
+ }
+ };
+
+ // Function table for OTHER pattern
+ static const std::array<NonLinearFilterFunction, 2> func_table_generic =
+ {
+ {
+ &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
+ &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
+ }
+ };
+
+ switch(_pattern)
+ {
+ case MatrixPattern::BOX:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
+ (this->*func_table_box[_func_idx])(window);
+ break;
+ case MatrixPattern::CROSS:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
+ (this->*func_table_cross[_func_idx])(window);
+ break;
+ case MatrixPattern::DISK:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
+ (this->*func_table_disk[_func_idx])(window);
+ break;
+ case MatrixPattern::OTHER:
+ default:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
+ (this->*func_table_generic[_func_idx])(window);
+ break;
+ }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..1826c474f7
--- /dev/null
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
+ mask = vandq_u16(mask, vcgeq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
+ mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+ // vc >= nc.val[0], vc > nc.val[2]
+ mask = vandq_u16(mask, vcgeq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+ // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
+ mask = vandq_u16(mask, vcgtq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
+{
+ auto in = static_cast<const float *__restrict>(in_ptr) - 1;
+ const auto out = static_cast<float *__restrict>(out_ptr);
+
+ // Get centre scores
+ const float16x8x2_t vc =
+ {
+ vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
+ vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
+ };
+
+ // Neighboring pixels
+ in -= in_stride;
+
+ static const float16x4_t zero_f16x4 = vdup_n_f16(0);
+ static const uint16x8_t zero_u16 = vdupq_n_u16(0);
+ static const uint16x8_t true_mask = vceqq_u16(zero_u16, zero_u16);
+ static const uint16x8x2_t true_mask_x2 =
+ {
+ true_mask,
+ true_mask
+ };
+
+ uint16x8x2_t mask = true_mask_x2;
+
+ // Top row
+ const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
+ mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
+ mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);
+
+ in += in_stride;
+
+ // Middle row
+ const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc >= nc.val[0], vc > nc.val[2]
+ mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
+ mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);
+
+ in += in_stride;
+
+ // Bottom row
+ const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
+ mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
+ mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);
+
+ // Store
+ static const float16x8_t zero_f16x8 = vdupq_n_f16(0);
+
+ const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
+ vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
+ vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));
+
+ const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
+ vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
+ vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
+}
+
+inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
+{
+ auto in = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
+ const auto out = static_cast<uint8_t *__restrict>(out_ptr);
+
+ // Get centre scores
+ const uint8x16_t vc = vld1q_u8(in + 1);
+
+ // Neighboring pixels
+ in -= in_stride;
+
+ // Top row
+ const uint8x16_t l_nc_0 = vld1q_u8(in);
+ const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
+ const uint8x16_t r_nc_0 = vld1q_u8(in + 2);
+
+ // Keep center scores if ...
+ // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
+ uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
+ mask = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
+ mask = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));
+
+ in += in_stride;
+
+ // Middle row
+ const uint8x16_t l_nc_1 = vld1q_u8(in);
+ const uint8x16_t r_nc_1 = vld1q_u8(in + 2);
+
+ // ... and ...
+ // vc >= l_nc_1, vc > r_nc_1
+ mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));
+
+ in += in_stride;
+
+ // Bottom row
+ const uint8x16_t l_nc_2 = vld1q_u8(in);
+ const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
+ const uint8x16_t r_nc_2 = vld1q_u8(in + 2);
+
+ // ... and ...
+ // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
+ mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
+ mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));
+
+ // Store
+ static const uint8x16_t zero = vdupq_n_u8(0);
+ vst1q_u8(out, vbslq_u8(mask, vc, zero));
+}
+} // namespace fp16
+
+void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ _func = &fp16::non_maxima_suppression3x3_U8_U8;
+ break;
+ default:
+ _func = &fp16::non_maxima_suppression3x3_F32_F32;
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
+inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
+{
+ auto input = static_cast<const float *__restrict>(input_ptr) - 1;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Get centre scores
+ const float32x4x4_t vc =
+ {
+ {
+ vld1q_f32(input + 1),
+ vld1q_f32(input + 5),
+ vld1q_f32(input + 9),
+ vld1q_f32(input + 13)
+ }
+ };
+
+ // Neighboring pixels
+ float32x4x4_t l_nc{ {} };
+ float32x4x4_t m_nc{ {} };
+ float32x4x4_t r_nc{ {} };
+
+ input -= input_stride;
+
+ // Row0 - Low part
+ float32x4_t tmp_low = vld1q_f32(input);
+ float32x4_t tmp_high = vld1q_f32(input + 4);
+ float32x4_t tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row0 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
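+    // A pixel is kept only if it is >= its top-row and left neighbours and strictly > its right and
+    // bottom-row neighbours; the mixed >=/> comparisons break ties between adjacent equal values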
+    // vc >= l_nc, vc >= m_nc, vc >= r_nc
+ uint32x4x4_t mask{ {} };
+ mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));
+
+ input += input_stride;
+
+ // Row1 - Low part
+ tmp_low = vld1q_f32(input);
+ tmp_high = vld1q_f32(input + 4);
+ tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row1 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
+    // vc >= l_nc, vc > r_nc
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
+
+ input += input_stride;
+
+ // Row2 - Low part
+ tmp_low = vld1q_f32(input);
+ tmp_high = vld1q_f32(input + 4);
+ tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row2 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
+    // vc > l_nc, vc > m_nc, vc > r_nc
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
+
+ static const float32x4_t zero = vdupq_n_f32(0.f);
+
+ // Store
+ vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
+ vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
+ vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
+ vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
+}
+
+inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
+{
+ auto input = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ // Get centre scores
+ const uint8x16_t vc = vld1q_u8(input + 1);
+
+ // Neighboring pixels
+ uint8x16_t l_nc{};
+ uint8x16_t m_nc{};
+ uint8x16_t r_nc{};
+
+ input -= input_stride;
+
+ // Row0
+ l_nc = vld1q_u8(input);
+ m_nc = vld1q_u8(input + 1);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc >= l_nc, vc >= m_nc, vc >= r_nc
+ uint8x16_t mask = vcgeq_u8(vc, l_nc);
+ mask = vandq_u8(mask, vcgeq_u8(vc, m_nc));
+ mask = vandq_u8(mask, vcgeq_u8(vc, r_nc));
+
+ input += input_stride;
+
+ // Row1
+ l_nc = vld1q_u8(input);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc >= l_nc, vc > r_nc
+ mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
+
+ input += input_stride;
+
+ // Row2
+ l_nc = vld1q_u8(input);
+ m_nc = vld1q_u8(input + 1);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc > l_nc, vc > m_nc, vc > r_nc
+ mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
+
+ static const uint8x16_t zero = vdupq_n_u8(0);
+
+ // Store
+ vst1q_u8(output, vbslq_u8(mask, vc, zero));
+}
+} // namespace
+
+NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ if(input->info()->data_type() == DataType::U8)
+ {
+ _func = &non_maxima_suppression3x3_U8_U8;
+ }
+ else
+ {
+ _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NENonMaximaSuppression3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ _func(input.ptr(), output.ptr(), input_stride);
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..a971dc8d97
--- /dev/null
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NENormalizationLayerKernel::NENormalizationLayerKernel()
+ : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
+{
+}
+
+BorderSize NENormalizationLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+
+ const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
+
+ _input = input;
+ _input_squared = input_squared;
+ _output = output;
+ _norm_info = norm_info;
+ _border_size = BorderSize(0, border_width);
+
+ const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ AccessWindowRectangle input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, input_squared_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size() / 2;
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+    // We account for padding across X only, as we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
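+        // out = in / (kappa + scale_coeff * sum_of_squares)^beta, with the division done as a multiplication by the reciprocal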
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+}
+
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size() / 2;
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+    // We account for padding across X only, as we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+}
+
+void NENormalizationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ // Run function
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..aa8c7a1847
--- /dev/null
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+
+#include <arm_neon.h>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+const float scale255_constant = 1.f / 255.f;
+const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
+const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
+
+/* Scales a given vector by 1/255.
+ *
+ * @note This does not work for all cases, e.g. for a float value of 0.49999999999999994 or for large float values.
+ *
+ * @param in Input vector to scale.
+ * @return Scaled output rounded to nearest (round half up).
+ */
+inline int32x4_t scale255_S32_S32(int32x4_t in)
+{
+ // Scale
+ const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
+ // Round to nearest (round half up)
+ // Add +0.5 for all values
+ // Afterwards vcvt rounds toward zero
+ return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
+}
+
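+/* Scales each U16 lane by 1/255: both halves are widened to 32 bits, scaled with scale255_S32_S32 (round half up), then narrowed back to U16. */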
+inline uint16x8_t scale255_U16_U16(uint16x8_t in)
+{
+ const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
+ const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
+ return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ const uint8x16_t ta1 = vld1q_u8(input1);
+ const uint8x16_t ta2 = vld1q_u8(input2);
+
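+    // Widen U8 -> U16 so that the 8-bit x 8-bit products cannot overflow before scaling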
+ uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
+ const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+ uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
+ const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
+
+ tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
+
+ if(is_scale255)
+ {
+ tmp1_high = scale255_U16_U16(tmp1_high);
+ tmp1_low = scale255_U16_U16(tmp1_low);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if(is_sat)
+ {
+ tmp1_high = vqshlq_u16(tmp1_high, vn);
+ tmp1_low = vqshlq_u16(tmp1_low, vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_u16(tmp1_high, vn);
+ tmp1_low = vshlq_u16(tmp1_low, vn);
+ }
+ }
+
+ if(is_sat)
+ {
+ vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+ }
+ else
+ {
+ vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+ // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
+ ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
+ ARM_COMPUTE_UNUSED(n);
+
+ const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<qint8_t *__restrict>(output_ptr);
+
+ const qint8x16_t ta1 = vld1q_qs8(input1);
+ const qint8x16_t ta2 = vld1q_qs8(input2);
+
+ qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+
+ vst1q_s8(output, res);
+}
+
+template <bool is_scale255, bool is_sat>
+inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
+{
+ int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
+ const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
+ int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1));
+ const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2));
+
+ tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
+
+ if(is_scale255)
+ {
+ tmp1_high = scale255_S32_S32(tmp1_high);
+ tmp1_low = scale255_S32_S32(tmp1_low);
+ }
+ else
+ {
+ // Right shift amount
+ const int32x4_t vn = vdupq_n_s32(-n);
+ // Left shift amount
+ const int32x4_t vnl = vdupq_n_s32(n);
+ // Calculate conversion bit
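+        // For negative products, adding (2^n - 1) before the arithmetic right shift makes the
+        // shift truncate towards zero (TO_ZERO rounding) instead of towards negative infinity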
+ const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
+ const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
+ const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
+ const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
+ const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
+ const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
+ const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
+ const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
+ if(is_sat)
+ {
+ tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ }
+
+ if(is_sat)
+ {
+ return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
+ }
+ else
+ {
+ return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
+{
+ const int16x8x2_t result =
+ {
+ {
+ // First 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
+ // Second 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
+ }
+ };
+
+ return result;
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const int16_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const int16x8x2_t ta1 = vld2q_s16(input1);
+ const int16x8x2_t ta2 = vld2q_s16(input2);
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst2q_s16(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
+{
+ const auto input1 = static_cast<const float *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const float *__restrict>(input2_ptr);
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ const float32x4x4_t ta1 = vld4q_f32(input1);
+ const float32x4x4_t ta2 = vld4q_f32(input2);
+ const float32x4_t scale_vec = vdupq_n_f32(scale);
+ const float32x4x4_t result =
+ {
+ {
+ vmulq_f32(vmulq_f32(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[1], ta2.val[1]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[2], ta2.val[2]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[3], ta2.val[3]), scale_vec)
+ }
+ };
+ vst4q_f32(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const uint8x16_t bv = vld1q_u8(input2);
+ const uint8x16_t av = vld1q_u8(input1);
+
+ uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
+ uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+ tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+ tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+ if(is_scale255)
+ {
+ tmp_low = scale255_U16_U16(tmp_low);
+ tmp_high = scale255_U16_U16(tmp_high);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if(is_sat)
+ {
+ tmp_low = vqshlq_u16(tmp_low, vn);
+ tmp_high = vqshlq_u16(tmp_high, vn);
+ }
+ else
+ {
+ tmp_low = vshlq_u16(tmp_low, vn);
+ tmp_high = vshlq_u16(tmp_high, vn);
+ }
+ }
+
+ if(is_sat)
+ {
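+        // Clamp to SHRT_MAX: the unsigned products are reinterpreted as signed S16 below,
+        // so values above 0x7FFF would otherwise wrap to negative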
+ static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+
+ tmp_low = vminq_u16(tmp_low, max);
+ tmp_high = vminq_u16(tmp_high, max);
+ }
+
+ vst1q_s16(output, vreinterpretq_s16_u16(tmp_low));
+ vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high));
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const int16x8x2_t ta1 = vld2q_s16(input1);
+ const uint8x8x2_t ta2u = vld2_u8(input2);
+ const int16x8x2_t ta2 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
+ vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
+ }
+ };
+
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst2q_s16(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ // Simply swap the two input buffers
+ mul_S16_U8_S16_n<is_scale255, is_sat>(input2_ptr, input1_ptr, output_ptr, n);
+}
+} // namespace
+
+NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
+ : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+{
+}
+
+void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || input2->info()->data_type() == DataType::QS8)
+ {
+ // All data types must be QS8
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ }
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _scale = scale;
+ _scale_exponent = 0;
+ _func_int = nullptr;
+ _func_q_int = nullptr;
+ _func_float = nullptr;
+
+ bool is_scale_255 = false;
+ // Check and validate scaling factor
+ if(std::abs(scale - scale255_constant) < 0.00001f)
+ {
+ ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ is_scale_255 = true;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ int exponent = 0;
+ const float normalized_mantissa = std::frexp(scale, &exponent);
+
+ // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15
+ // frexp returns 0.5 as the mantissa, so the exponent will be in the range -14 <= e <= 1
+ // (non-positive for n >= 1, since we are dealing with 1/2^n)
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate for frexp using a mantissa of 0.5
+ _scale_exponent = std::abs(exponent - 1);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Scale value not supported (Should be 1/(2^n) or 1/255");
+ }
+ }
+
+ const DataType dt_input1 = input1->info()->data_type();
+ const DataType dt_input2 = input2->info()->data_type();
+ const DataType dt_output = output->info()->data_type();
+ const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
+
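+ // Select the multiplication function matching the input/output data types and the scaling mode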
+ if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8_n<true, true> : &mul_U8_U8_U8_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8_n<false, true> : &mul_U8_U8_U8_n<false, false>;
+ }
+ }
+ else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16_n<true, true> : &mul_S16_S16_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16_n<false, true> : &mul_S16_S16_S16_n<false, false>;
+ }
+ }
+ else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16_n<true, true> : &mul_S16_U8_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16_n<false, true> : &mul_S16_U8_S16_n<false, false>;
+ }
+ }
+ else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16_n<true, true> : &mul_U8_S16_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16_n<false, true> : &mul_U8_S16_S16_n<false, false>;
+ }
+ }
+ else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16_n<true, true> : &mul_U8_U8_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
+ }
+ }
+ else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
+ }
+ else
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
+ }
+ }
+ else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
+ {
+ _func_float = &mul_F32_F32_F32_n<false, false>;
+ _func_int = nullptr;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEPixelWiseMultiplicationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ if(_func_int != nullptr)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
+ },
+ input1, input2, output);
+ }
+ else if(_func_q_int != nullptr)
+ {
+ int fixed_point_position = _input1->info()->fixed_point_position();
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+ },
+ input1, input2, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
+ },
+ input1, input2, output);
+ }
+}
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..30b67b64b9
--- /dev/null
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <limits>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = id.x() * stride_x - pad_x;
+ int start_y = id.y() * stride_y - pad_y;
+ int end_x = std::min(start_x + pool_size, upper_bound_w);
+ int end_y = std::min(start_y + pool_size, upper_bound_h);
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
+
+inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
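+ // Reciprocals 1/N in Q0.7 fixed-point format, indexed by the number N of elements in the pooling region; shifted below to the tensor's fixed-point position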
+ static std::array<qint8_t, 10> scale_values_q8 =
+ { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return scale_values_q8[val] >> (7 - fixed_point_position);
+}
+} // namespace
+
+NEPoolingLayerKernel::NEPoolingLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+{
+}
+
+BorderSize NEPoolingLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ PoolingType pool_type = pool_info.pool_type();
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ DimensionRoundingType pool_round = pad_stride_info.round();
+ std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
+
+ // Check output dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ pool_size, pool_stride_x, pool_stride_y,
+ pool_pad_x, pool_pad_y, pool_round);
+ ARM_COMPUTE_UNUSED(pooled_w);
+ ARM_COMPUTE_UNUSED(pooled_h);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+ unsigned int num_elems_read_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_horizontal_window = 0;
+
+ // Select element size
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
+ num_elems_horizontal_window = 8;
+ break;
+ case DataType::F32:
+ num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vld1q_f32 (4 floats) for pooling 3
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
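+ // How far reads may extend beyond the right/bottom edge of the input; used below to size the border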
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ // Select appropriate function
+ switch(pool_size)
+ {
+ case 2:
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ }
+ break;
+ case 3:
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling size");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling
+ const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
+ res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
+ res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling
+ const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
+ const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
+ const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
+ const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
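+ // For stride 2 only every other result is needed: gather the even lanes with a table lookup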
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_sum);
+ }
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
+ const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
+ const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
+ const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_max);
+ }
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
+ }
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ unsigned int pool_stride_x = 0;
+ unsigned int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+ // Set the step of the input window in the x and y directions
+ Window window_input(window);
+ unsigned int window_x_inc = 0;
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ }
+ else
+ {
+ window_x_inc = pool_stride_x;
+ }
+ window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+ window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+
+ // Run function
+ (this->*_func)(window_input, window);
+}
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
new file mode 100644
index 0000000000..c3c44a5f32
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
+{
+ static const float32x4_t lowerxy = vdupq_n_f32(-1.0f);
+
+ float32x4_t x = vld1q_f32(mapx_ptr);
+ float32x4_t y = vld1q_f32(mapy_ptr);
+
+ // Clamp x and y coordinates
+ x = vmaxq_f32(lowerxy, vminq_f32(x, width));
+ y = vmaxq_f32(lowerxy, vminq_f32(y, height));
+
+ const int32x4_t x_s32 = vcvtq_s32_f32(x);
+ const int32x4_t y_s32 = vcvtq_s32_f32(y);
+
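+ // Linear offset of the nearest pixel from the start of the plane: x + y * stride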
+ return vmlaq_s32(x_s32, y_s32, stride);
+}
+
+} // namespace
+
+NERemapKernel::NERemapKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+
+ _input = input;
+ _output = output;
+ _map_x = map_x;
+ _map_y = map_y;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ _func = &NERemapKernel::remap_nearest;
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ _func = &NERemapKernel::remap_bilinear;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ AccessWindowRectangle(map_x->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ AccessWindowRectangle(map_y->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NERemapKernel::remap_nearest(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator mapx(_map_x, window);
+ Iterator mapy(_map_y, window);
+
+ const float32x4_t width = vdupq_n_f32(static_cast<float>(_input->info()->dimension(0)));
+ const float32x4_t height = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
+ const int32x4_t in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
+ const auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
+ const uint8_t *in_ptr = in.ptr();
+
+ const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride);
+ const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride);
+ const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
+ const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp0, 0);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp0, 1);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp0, 2);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp0, 3);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp0, 4);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp0, 5);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp0, 6);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp1, 0);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp1, 1);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp1, 2);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp1, 3);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp1, 4);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp1, 5);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp1, 6);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out, mapx, mapy);
+}
+
+void NERemapKernel::remap_bilinear(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator mapx(_map_x, window);
+ Iterator mapy(_map_y, window);
+
+ const size_t width = _input->info()->dimension(0);
+ const size_t height = _input->info()->dimension(1);
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
+ const auto mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
+ const uint8_t *in_ptr = in.ptr();
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out, mapx, mapy);
+}
+
+void NERemapKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
new file mode 100644
index 0000000000..fd2978de1c
--- /dev/null
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEScaleKernel::NEScaleKernel()
+ : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NEScaleKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ }
+
+ if(policy == InterpolationPolicy::BILINEAR)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _offsets = offsets;
+ _dx = dx;
+ _dy = dy;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ _func = &NEScaleKernel::scale_nearest;
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32);
+
+ _func = &NEScaleKernel::scale_bilinear;
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ _func = &NEScaleKernel::scale_area;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input_access,
+ offsets_access,
+ dx_access,
+ dy_access,
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEScaleKernel::scale_nearest(const Window &window)
+{
+ const size_t input_stride = _input->info()->strides_in_bytes()[1];
+
+ // Compute the ratio between source height and destination height
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_off;
+ win_off.set(Window::DimX, window[Window::DimX]);
+ win_off.set(Window::DimY, window[Window::DimY]);
+
+ for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator offsets(_offsets, win_off);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ uint8x16_t tmp = vdupq_n_u8(0);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const uint8_t *const in_ptr = in.ptr();
+
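+ // Nearest input row for this output row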
+ const size_t in_yi = (id.y() + 0.5f) * hr;
+ const size_t offset_row = in_yi * input_stride;
+
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15);
+
+ vst1q_u8(out.ptr(), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const size_t in_yi = (id.y() + 0.5f) * hr;
+ const size_t offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
+
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+void NEScaleKernel::scale_bilinear(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_off;
+ win_off.set(Window::DimX, window.x());
+ win_off.set(Window::DimY, window.y());
+
+ for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator offsets(_offsets, win_off);
+ Iterator dx(_dx, win_off);
+ Iterator dy(_dy, win_off);
+
+ // Input image stride
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
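+ // Top row of the 2x2 input neighbourhood used for bilinear interpolation (centre-aligned mapping)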
+ const size_t in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const size_t offset_row = in_yi * in_stride;
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+}
+
+void NEScaleKernel::scale_area(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const auto wr = static_cast<float>(_input->info()->dimension(0)) / static_cast<float>(_output->info()->dimension(0));
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+ const auto w = _input->info()->dimension(0);
+ const auto h = _input->info()->dimension(1);
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
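+ // Each output pixel is computed from the corresponding area of the input, clamped to the image bounds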
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out);
+}
+
+void NEScaleKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..183df1efcb
--- /dev/null
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
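+ // Scharr 3x3 filter coefficients: +/-3 and +/-10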
+const int16x8_t three = vdupq_n_s16(3);
+const int16x8_t minus_three = vdupq_n_s16(-3);
+const int16x8_t ten = vdupq_n_s16(10);
+const int16x8_t minus_ten = vdupq_n_s16(-10);
+
+inline int16x8_t scharr_y(const int16x8x2_t &top, const int16x8x2_t &bottom)
+{
+ // Top left
+ int16x8_t out = vmulq_s16(top.val[0], minus_three);
+ // Top center
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 1), minus_ten);
+ // Top right
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), minus_three);
+
+ // Bottom left
+ out = vmlaq_s16(out, bottom.val[0], three);
+ // Bottom center
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 1), ten);
+ // Bottom right
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
+
+ return out;
+}
+
+inline int16x8_t scharr_x(const int16x8x2_t &top, const int16x8x2_t &middle, const int16x8x2_t &bottom)
+{
+ // Top left
+ int16x8_t out = vmulq_s16(top.val[0], minus_three);
+ // Top right
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), three);
+
+ // Middle left
+ out = vmlaq_s16(out, middle.val[0], minus_ten);
+ // Middle right
+ out = vmlaq_s16(out, vextq_s16(middle.val[0], middle.val[1], 2), ten);
+
+ // Bottom left
+ out = vmlaq_s16(out, bottom.val[0], minus_three);
+ // Bottom right
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
+
+ return out;
+}
+} // namespace
+
+NEScharr3x3Kernel::NEScharr3x3Kernel()
+ : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+void NEScharr3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_scharr_x = output_x != nullptr;
+ _run_scharr_y = output_y != nullptr;
+
+ if(_run_scharr_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_scharr_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+BorderSize NEScharr3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEScharr3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
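+    // Row pointers start one pixel to the left of the current element for the rows above, at and
+    // below it, so a single 16-byte load per row covers the left, centre and right neighbours.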
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ Iterator input(_input, window);
+ Iterator output_y;
+ Iterator output_x;
+
+ if(_run_scharr_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
+ if(_run_scharr_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_scharr_x && _run_scharr_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_scharr_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
+ },
+ input, output_x);
+ }
+ else if(_run_scharr_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
+ },
+ input, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
new file mode 100644
index 0000000000..ab08a1cfeb
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NESobel3x3Kernel::NESobel3x3Kernel()
+ : _run_sobel_x(false), _run_sobel_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize NESobel3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, 1));
+
+ Iterator input(_input, window);
+ Iterator output_y;
+ Iterator output_x;
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
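+    // 3x3 Sobel kernels applied below:
+    //   Gx = [ -1 0 1; -2 0 2; -1 0 1 ]    Gy = [ -1 -2 -1; 0 0 0; 1 2 1 ]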
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //SOBEL Y
+ //top left
+ int16x8_t out_y = vnegq_s16(top_s16.val[0]);
+ //top mid
+ out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
+ //top right
+ out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //bot left
+ out_y = vaddq_s16(out_y, bot_s16.val[0]);
+ //bot mid
+ out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
+
+ //SOBEL X
+ //top left
+ int16x8_t out_x = vnegq_s16(top_s16.val[0]);
+ //top right
+ out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo);
+ //mid right
+ out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out_x = vsubq_s16(out_x, bot_s16.val[0]);
+ //bot right
+ out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //SOBEL X
+ //top left
+ int16x8_t out = vnegq_s16(top_s16.val[0]);
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vmlaq_s16(out, mid_s16.val[0], minustwo);
+ //mid right
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out = vsubq_s16(out, bot_s16.val[0]);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //SOBEL Y
+ //top left
+ int16x8_t out = vnegq_s16(top_s16.val[0]);
+ //top mid
+ out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
+ //top right
+ out = vsubq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
+ },
+ input, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
new file mode 100644
index 0000000000..488eee1176
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
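+// The 5x5 Sobel filter is applied separably: the horizontal kernel convolves each row with
+// { -1, -2, 0, 2, 1 } (derivative, used for X) and { 1, 4, 6, 4, 1 } (smoothing, used for Y),
+// writing S16 intermediates; the vertical kernel then applies the complementary 1D filters column-wise.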
+NESobel5x5HorKernel::NESobel5x5HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize NESobel5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel5x5HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output_x;
+ Iterator output_y;
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out_y = data_s16.val[0];
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+ out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
+
+ int16x8_t out_x = vnegq_s16(data_s16.val[0]);
+ out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
+ out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
+ out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vnegq_s16(data_s16.val[0]);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
+ out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = data_s16.val[0];
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+ out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
+ },
+ input, output_y);
+ }
+}
+
+NESobel5x5VertKernel::NESobel5x5VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize NESobel5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ const ITensor *const input = _run_sobel_x ? input_x : input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel5x5VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input_x;
+ Iterator input_y;
+ Iterator output_x;
+ Iterator output_y;
+
+ const int16_t *input_x_low2_ptr = nullptr;
+ const int16_t *input_x_low_ptr = nullptr;
+ const int16_t *input_x_mid_ptr = nullptr;
+ const int16_t *input_x_top_ptr = nullptr;
+ const int16_t *input_x_top2_ptr = nullptr;
+
+ const int16_t *input_y_low2_ptr = nullptr;
+ const int16_t *input_y_low_ptr = nullptr;
+ const int16_t *input_y_top_ptr = nullptr;
+ const int16_t *input_y_top2_ptr = nullptr;
+
+ if(_run_sobel_x)
+ {
+ input_x = Iterator(_input_x, window);
+ output_x = Iterator(_output_x, window);
+ input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
+ input_x_top_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
+ input_x_mid_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
+ input_x_low_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
+ input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
+ }
+
+ if(_run_sobel_y)
+ {
+ input_y = Iterator(_input_y, window);
+ output_y = Iterator(_output_y, window);
+ input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
+ input_y_top_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
+ input_y_low_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
+ input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
+ }
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
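+    // Vertical pass: the X gradient accumulates the 5 rows with { 1, 4, 6, 4, 1 }, the Y gradient
+    // with { -1, -2, 0, 2, 1 } (the centre row has zero weight for Y and is never loaded).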
+ if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+            // Convert the byte offset into an offset in int16_t elements
+ const size_t input_offset_high_s16 = input_x.offset() / 2;
+ const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
+
+ //HIGH DATA
+ //top2
+ int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
+ int16x8_t out_high = data_high;
+ //top
+ data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, four);
+ //mid
+ data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, six);
+ //low
+ data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, four);
+ //low2
+ data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
+ out_high = vaddq_s16(out_high, data_high);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
+
+ //LOW DATA
+ //top2
+ int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
+ int16x8_t out_low = data_low;
+ //top
+ data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, four);
+ //mid
+ data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, six);
+ //low
+ data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, four);
+ //low2
+ data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
+ out_low = vaddq_s16(out_low, data_low);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
+ },
+ input_x, output_x);
+ }
+
+ if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+            // Convert the byte offset into an offset in int16_t elements
+ const size_t input_offset_high_s16 = input_y.offset() / 2;
+ const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
+
+ //HIGH DATA
+ //top2
+ int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
+ int16x8_t out_high = vnegq_s16(data_high);
+ //top
+ data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, minustwo);
+ //low
+ data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, two);
+ //low2
+ data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
+ out_high = vaddq_s16(out_high, data_high);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
+
+ //LOW DATA
+ //top2
+ int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
+ int16x8_t out_low = vnegq_s16(data_low);
+ //top
+ data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, minustwo);
+ //low
+ data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, two);
+ //low2
+ data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
+ out_low = vaddq_s16(out_low, data_low);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
+ },
+ input_y, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
new file mode 100644
index 0000000000..9761942c69
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
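+// The 7x7 Sobel filter is applied separably with the 1D kernels { -1, -4, -5, 0, 5, 4, 1 }
+// (derivative) and { 1, 6, 15, 20, 15, 6, 1 } (smoothing); the accumulated response exceeds the
+// signed 16-bit range, so S32 is used for the intermediate and final tensors.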
+namespace
+{
+const int32x4_t minusfour = vdupq_n_s32(-4);
+const int32x4_t minusfive = vdupq_n_s32(-5);
+const int32x4_t four = vdupq_n_s32(4);
+const int32x4_t five = vdupq_n_s32(5);
+const int32x4_t six = vdupq_n_s32(6);
+const int32x4_t fifteen = vdupq_n_s32(15);
+const int32x4_t twenty = vdupq_n_s32(20);
+
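+// Applies the 1D derivative kernel { -1, -4, -5, 0, 5, 4, 1 } along the row to the widened input
+// values in 'data', producing the 8 horizontal-pass results used for the X gradient.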
+inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
+{
+ int32x4x2_t out =
+ {
+ {
+ vnegq_s32(data.val[0]),
+ vnegq_s32(data.val[1])
+ }
+ };
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 1), minusfour);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 2), minusfive);
+
+ out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 1), four);
+
+ out.val[0] = vaddq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 2));
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 1), minusfour);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 2), minusfive);
+
+ out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 1), four);
+
+ out.val[1] = vaddq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 2));
+
+ return out;
+}
+
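+// Applies the 1D smoothing kernel { 1, 6, 15, 20, 15, 6, 1 } along the row to the widened input
+// values in 'data', producing the 8 horizontal-pass results used for the Y gradient.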
+inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
+{
+ int32x4x2_t out =
+ {
+ {
+ data.val[0],
+ data.val[1]
+ }
+ };
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 1), six);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 2), fifteen);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 3), twenty);
+
+ out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 1), six);
+
+ out.val[0] = vaddq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 2));
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 1), six);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 2), fifteen);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 3), twenty);
+
+ out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 1), six);
+
+ out.val[1] = vaddq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 2));
+
+ return out;
+}
+} // namespace
+
+NESobel7x7HorKernel::NESobel7x7HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize NESobel7x7HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 3, 3);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel7x7HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output_x;
+ Iterator output_y;
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
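+    // Horizontal pass: each iteration loads 16 pixels starting 3 columns to the left of the
+    // current element, widens them to int32 and emits 8 results per enabled gradient.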
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+ const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
+
+ const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+ const int32x4x2_t out = compute_hor_sobel_x(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+            const int32x4x2_t out = compute_hor_sobel_y(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
+ },
+ input, output_y);
+ }
+}
+
+NESobel7x7VertKernel::NESobel7x7VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize NESobel7x7VertKernel::border_size() const
+{
+ return BorderSize(3, 0);
+}
+
+void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = (output_x != nullptr);
+ _run_sobel_y = (output_y != nullptr);
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ const ITensor *const input = _run_sobel_x ? input_x : input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel7x7VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input_x;
+ Iterator input_y;
+ Iterator output_x;
+ Iterator output_y;
+
+ int32_t in_x_stride = 0;
+ int32_t in_y_stride = 0;
+
+ if(_run_sobel_x)
+ {
+ input_x = Iterator(_input_x, window);
+ output_x = Iterator(_output_x, window);
+ in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
+ }
+
+ if(_run_sobel_y)
+ {
+ input_y = Iterator(_input_y, window);
+ output_y = Iterator(_output_y, window);
+ in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
+ }
+
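+    // Vertical pass: the X gradient accumulates the 7 rows with { 1, 6, 15, 20, 15, 6, 1 }, the
+    // Y gradient with { -1, -4, -5, 0, 5, 4, 1 }; in_x_stride / in_y_stride are row strides
+    // expressed in int32_t elements.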
+ if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
+
+ //top3
+ int32x4x2_t data =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4)
+ }
+ };
+
+ int32x4x2_t out = data;
+
+ //top2
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
+
+ //top
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
+
+ //mid
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
+
+ //low
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
+
+ //low2
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
+
+ //low3
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vaddq_s32(out.val[0], data.val[0]);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vaddq_s32(out.val[1], data.val[1]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
+ },
+ input_x, output_x);
+ }
+
+ if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
+
+ //top3
+ int32x4x2_t data =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4)
+ }
+ };
+
+ int32x4x2_t out =
+ {
+ {
+ vnegq_s32(data.val[0]),
+ vnegq_s32(data.val[1])
+ }
+ };
+
+ //top2
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
+
+ //top
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
+
+ //low
+ in_ptr += (2 * in_y_stride);
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
+
+ //low2
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
+
+ //low3
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vaddq_s32(out.val[0], data.val[0]);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vaddq_s32(out.val[1], data.val[1]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
+ },
+ input_y, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..942662e84b
--- /dev/null
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cfloat>
+
+using namespace arm_compute;
+
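+// Softmax along the innermost dimension is computed in three stages for numerical stability:
+// NELogits1DMaxKernel finds the per-row maximum, NELogits1DShiftExpSumKernel computes
+// exp(x - max) together with the per-row sum, and NELogits1DNormKernel divides each exponential
+// by that sum.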
+namespace
+{
+void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const float32x4_t current_value = vld1q_f32(in_ptr);
+ vec_max = vmaxq_f32(vec_max, current_value);
+ },
+ input);
+
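+        // Reduce the four running maxima to a single scalar with pairwise max operations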
+ float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+        // Initialise the running maximum with the lowest representable QS8 value
+        qint8x16_t vec_max = vdupq_n_s8(-128);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const qint8x16_t current_value = vld1q_qs8(in_ptr);
+ vec_max = vmaxq_qs8(vec_max, current_value);
+ },
+ input);
+
+ qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+
+ *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
+NELogits1DMaxKernel::NELogits1DMaxKernel()
+ : _func(nullptr), _border_size()
+{
+}
+
+BorderSize NELogits1DMaxKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const int input_width = input->info()->valid_region().shape.x();
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_max_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_max_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_written_per_row = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_row, 1.f / input_width);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DMaxKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
+
+namespace
+{
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
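+    // Each row is processed in vector-sized chunks; any tail that does not fill a full vector is
+    // handled by the scalar loop at the end of the slice.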
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+
+ // Init sum to zero
+ float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
+ const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+        // Run NEON loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_elements = vsubq_f32(vec_elements, vec_max);
+ vec_elements = vexpq_f32(vec_elements);
+
+ vst1q_f32(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+
+ // Reduce sum
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+ float sum = vget_lane_f32(carry_addition, 0);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ float element = std::exp(in_ptr[i] - *max_ptr);
+ exp_ptr[i] = element;
+ sum += element;
+ }
+
+ *(reinterpret_cast<float *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 8;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
+
+ // Init sum to zero
+ qint16x8_t vec_sum_value = vdupq_n_qs16(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
+ const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
+
+        // Run NEON loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ qint8x8_t vec_elements = vld1_qs8(in_ptr);
+ vec_elements = vqsub_qs8(vec_elements, vec_max);
+ vec_elements = vqexp_qs8(vec_elements, fixed_point_position);
+
+ vst1_qs8(exp_ptr, vec_elements);
+ vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
+ const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
+ const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
+ qint16_t sum = sqadd_qs16(sum0, sum1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
+ exp_ptr[i] = element;
+ sum = sqadd_qs16(sum, element);
+ }
+
+ *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
+NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
+ : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+ unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_shift_exp_sum_qs8;
+ break;
+ case DataType::F32:
+ _func = &logits_1d_shift_exp_sum_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal max_access(max->info(), 0, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DShiftExpSumKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _max, _output, _sum, window);
+}
+
+namespace
+{
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
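+        // Multiply by the reciprocal of the row sum rather than dividing every element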
+ const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
+ const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4_t vec_in = vld1q_f32(in_ptr);
+ const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
+
+ vst1q_f32(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const int8_t sum_value = *reinterpret_cast<const qint8_t *>(_sum.ptr());
+ const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
+
+ const qint8x16_t vec_in = vld1q_qs8(in_ptr);
+ const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
+
+ vst1q_qs8(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
+
+NELogits1DNormKernel::NELogits1DNormKernel()
+ : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+
+ // Configure kernel window
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_norm_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_norm_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
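+    // Only the first element of each row of the sum tensor is read (one sum value per row)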
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DNormKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _sum, _output, window);
+}
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
new file mode 100644
index 0000000000..f0b58d82f6
--- /dev/null
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+} // namespace arm_compute
+
+NETableLookupKernel::NETableLookupKernel()
+ : _func(nullptr), _lut(nullptr)
+{
+}
+
+template <class T>
+void NETableLookupKernel::tableLookup(const Window &window)
+{
+ uint32_t offset = _lut->index_offset();
+ size_t count = _lut->num_elements();
+ const auto lut = reinterpret_cast<const T *>(_lut->buffer());
+    unsigned int step   = num_elems_processed_per_iteration;
+
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+
+ Iterator input = Iterator(_input, window);
+ Iterator output = Iterator(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ for(unsigned int i = 0; i < step; ++i, ++input_ptr, ++output_ptr)
+ {
+ const int32_t index = offset + *input_ptr;
+
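+            // Only write when the offset-adjusted index falls inside the LUT; out-of-range inputs leave the output unchanged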
+ if(0 <= index && index < static_cast<int32_t>(count))
+ {
+ *output_ptr = lut[index];
+ }
+ }
+ },
+ input, output);
+}
+
+namespace arm_compute
+{
+template <>
+void NETableLookupKernel::tableLookup<uint8_t>(const Window &window)
+{
+ const uint8_t *const lut = _lut->buffer();
+    unsigned int step = num_elems_processed_per_iteration;
+
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+
+ Iterator input = Iterator(_input, window);
+ Iterator output = Iterator(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = input.ptr();
+ uint8_t *output_ptr = output.ptr();
+
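+        // No bounds check needed here: a uint8_t value always indexes within the 256-entry U8 LUT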
+ for(unsigned int i = 0; i < step; ++i)
+ {
+ *output_ptr++ = lut[*input_ptr++];
+ }
+ },
+ input, output);
+}
+} // namespace arm_compute
+
+void NETableLookupKernel::configure(const ITensor *input, const ILut *lut, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _lut = lut;
+
+ if(input->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8)
+ {
+ _func = &NETableLookupKernel::tableLookup<uint8_t>;
+ }
+ else if(input->info()->data_type() == DataType::S16 && output->info()->data_type() == DataType::S16)
+ {
+ _func = &NETableLookupKernel::tableLookup<int16_t>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of input and output DataType.");
+ }
+
+    INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+}
+
+void NETableLookupKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
new file mode 100644
index 0000000000..72031195d9
--- /dev/null
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEThresholdKernel::NEThresholdKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _threshold(0), _false_value(0), _true_value(0), _upper(0)
+{
+}
+
+void NEThresholdKernel::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+ _false_value = false_value;
+ _true_value = true_value;
+ _upper = upper;
+
+ switch(type)
+ {
+ case ThresholdType::BINARY:
+ _func = &NEThresholdKernel::run_binary;
+ break;
+ case ThresholdType::RANGE:
+ _func = &NEThresholdKernel::run_range;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Thresholding type not recognized");
+ break;
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+inline void NEThresholdKernel::run_binary(const Window &window)
+{
+ const uint8x16_t threshold = vdupq_n_u8(_threshold);
+ const uint8x16_t true_value = vdupq_n_u8(_true_value);
+ const uint8x16_t false_value = vdupq_n_u8(_false_value);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
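+    // BINARY threshold: out = true_value if in > threshold, false_value otherwise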
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+ const uint8x16_t mask = vcgtq_u8(data, threshold);
+
+ vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+ },
+ input, output);
+}
+
+inline void NEThresholdKernel::run_range(const Window &window)
+{
+ const uint8x16_t lower_threshold = vdupq_n_u8(_threshold);
+ const uint8x16_t upper_threshold = vdupq_n_u8(_upper);
+ const uint8x16_t true_value = vdupq_n_u8(_true_value);
+ const uint8x16_t false_value = vdupq_n_u8(_false_value);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
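+    // RANGE threshold: out = true_value if lower <= in <= upper, false_value otherwise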
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ uint8x16_t mask = vcleq_u8(data, upper_threshold);
+
+ mask = vandq_u8(vcgeq_u8(data, lower_threshold), mask);
+
+ vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+ },
+ input, output);
+}
+
+void NEThresholdKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
new file mode 100644
index 0000000000..492de8a6ee
--- /dev/null
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 3 * input_stride_in_bytes));
+ const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 4 * input_stride_in_bytes));
+ const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 5 * input_stride_in_bytes));
+ const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 6 * input_stride_in_bytes));
+ const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 7 * input_stride_in_bytes));
+
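+        // Build the 8x8 transpose in three stages: interleave 8-bit pairs, then 16-bit pairs, then 32-bit pairs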
+ // Transpose 2x2
+ const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+ const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+ const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+ const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+
+ // Transpose 4x4
+ const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+ const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+ const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+ const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+
+ // Transpose 8x8
+ const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+ },
+ input, output);
+}
+
+void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes));
+
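+        // Build the 4x4 transpose in two stages: interleave 16-bit pairs, then 32-bit pairs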
+ // Transpose 2x2
+ const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+ const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+
+ // Transpose 4x4
+ const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
+ },
+ input, output);
+}
+
+void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+
+ // Swap block 01 with block 10 and store
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+ },
+ input, output);
+}
+} // namespace
+
+NETransposeKernel::NETransposeKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NETransposeKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+    // Auto-initialize the output tensor if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &transpose_8bit_elements;
+ num_elems_processed_per_iteration = 8;
+ break;
+ case 2:
+ _func = &transpose_16bit_elements;
+ num_elems_processed_per_iteration = 4;
+ break;
+ case 4:
+ _func = &transpose_32bit_elements;
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NETransposeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
new file mode 100644
index 0000000000..6c90a334af
--- /dev/null
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace
+{
+inline uint8_t nearest_interpolation(const uint8_t *in_ptr, int x, int y, size_t stride)
+{
+ return in_ptr[x + y * stride];
+}
+} // namespace
+
+INEWarpKernel::INEWarpKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _constant_border_value(0), _matrix(nullptr)
+{
+}
+
+void INEWarpKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void INEWarpKernel::configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
+ _matrix = matrix;
+ _constant_border_value = constant_border_value;
+
+ switch(border_mode)
+ {
+ case BorderMode::UNDEFINED:
+ _func = &INEWarpKernel::warp_undefined;
+ break;
+ case BorderMode::CONSTANT:
+ _func = &INEWarpKernel::warp_constant;
+ break;
+ case BorderMode::REPLICATE:
+ _func = &INEWarpKernel::warp_replicate;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Border mode not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(1U));
+
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0], input_valid_region.anchor[1],
+ input_valid_region.anchor[0] + input_valid_region.shape[0],
+ input_valid_region.anchor[1] + input_valid_region.shape[1]);
+ AccessWindowHorizontal output_access(output->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+    // x0 = M00 * x + M01 * y + M02
+    // y0 = M10 * x + M11 * y + M12
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ // Affine warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only write to output if x0 and y0 are within the valid region.
+ // Otherwise the read value would be undefined.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+    // x0 = M00 * x + M01 * y + M02
+    // y0 = M10 * x + M11 * y + M12
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ // Affine warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only use input values if x0 and y0 are within the valid region.
+ // Otherwise write the constant border value.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ *out.ptr() = _constant_border_value;
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // Current row
+ int y_cur = window.y().start();
+
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only load from (x0, y0) if the point is within the valid region.
+ // Otherwise load from the edge of the valid region.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ // Clamp coordinates
+ const auto xi = clamp<int>(x0, min_x, max_x - 1);
+ const auto yi = clamp<int>(y0, min_y, max_y - 1);
+
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+ // Only write to output if xn and yn are within the valid region.
+ // Otherwise the read value would be undefined.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+ // Only use input values if xn and yn are within the valid region.
+ // Otherwise write the constant border value.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ *out.ptr() = _constant_border_value;
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+        // Only load from (xn, yn) if the point is within the valid region.
+        // Otherwise load from the edge of the valid region.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+            // Clamp the perspective-divided coordinates to the valid region
+            const auto xi = clamp<int>(xn, min_x, max_x - 1);
+            const auto yi = clamp<int>(yn, min_y, max_y - 1);
+
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
+
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::BILINEAR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>;
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..aa6be44bee
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+ const unsigned int kernel_size = input->info()->dimension(0);
+ const unsigned int kernel_depth = input->info()->dimension(2);
+ const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(input, window);
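+    // Each window iteration linearizes one kernel (index id[3]) into a single column of the output matrix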
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for(unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for(unsigned int j = 0; j < kernel_size; ++j)
+ {
+ for(unsigned int i = 0; i < kernel_size; ++i)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if(bias != nullptr)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
+ }
+ },
+ in);
+}
+} // namespace
+
+NEWeightsReshapeKernel::NEWeightsReshapeKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
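+    // Reshaped output: one column per kernel, with height kernel_w * kernel_h * depth (+1 row for the bias when present)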
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+
+ // Set data type and shape for output tensor if not yet configured
+ set_data_type_if_unknown(*output->info(), dt);
+ set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(bias != nullptr)
+ {
+ TensorShape bias_shape{ input->info()->tensor_shape()[3] };
+
+ // Set data type and shape for bias tensor if not yet configured
+ set_data_type_if_unknown(*bias->info(), dt);
+ set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
+ set_shape_if_empty(*bias->info(), bias_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ _func = &weights_reshape<uint32_t>;
+ break;
+ }
+ case DataType::QS8:
+ {
+ _func = &weights_reshape<uint8_t>;
+ break;
+ }
+ default:
+ {
+            ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel
+ Window window = calculate_max_window(*input->info(), Steps());
+ window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+
+    // NEWeightsReshapeKernel doesn't need padding, so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(window);
+}
+
+void NEWeightsReshapeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (*_func)(_input, _bias, _output, window);
+}
diff --git a/src/core/PyramidInfo.cpp b/src/core/PyramidInfo.cpp
new file mode 100644
index 0000000000..1c12eee46f
--- /dev/null
+++ b/src/core/PyramidInfo.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/PyramidInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+PyramidInfo::PyramidInfo()
+ : _num_levels(0), _tensor_shape(), _format(Format::UNKNOWN), _scale(0.0f)
+{
+}
+
+PyramidInfo::PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format)
+ : PyramidInfo()
+{
+ init(num_levels, scale, width, height, format);
+}
+
+PyramidInfo::PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format)
+ : PyramidInfo()
+{
+ init(num_levels, scale, tensor_shape, format);
+}
+
+void PyramidInfo::init(size_t num_levels, float scale, size_t width, size_t height, Format format)
+{
+ init(num_levels, scale, TensorShape(width, height), format);
+}
+
+void PyramidInfo::init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format)
+{
+ ARM_COMPUTE_ERROR_ON(0 == num_levels);
+ ARM_COMPUTE_ERROR_ON(0.0f == scale);
+ ARM_COMPUTE_ERROR_ON(0 == tensor_shape.x());
+ ARM_COMPUTE_ERROR_ON(0 == tensor_shape.y());
+ ARM_COMPUTE_ERROR_ON(Format::IYUV == format);
+ ARM_COMPUTE_ERROR_ON(Format::NV12 == format);
+ ARM_COMPUTE_ERROR_ON(Format::NV21 == format);
+ ARM_COMPUTE_ERROR_ON(Format::UYVY422 == format);
+ ARM_COMPUTE_ERROR_ON(Format::YUV444 == format);
+ ARM_COMPUTE_ERROR_ON(Format::YUYV422 == format);
+ ARM_COMPUTE_ERROR_ON_MSG(0 != _num_levels, "PyramidInfo already initialized");
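+    // The smallest pyramid level must still have non-zero width and height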
+ ARM_COMPUTE_ERROR_ON(0 == (tensor_shape.x() * pow(scale, num_levels)));
+ ARM_COMPUTE_ERROR_ON(0 == (tensor_shape.y() * pow(scale, num_levels)));
+
+ _num_levels = num_levels;
+ _format = format;
+ _scale = scale;
+ _tensor_shape = tensor_shape;
+}
+
+size_t PyramidInfo::num_levels() const
+{
+ return _num_levels;
+}
+
+size_t PyramidInfo::width() const
+{
+ return _tensor_shape.x();
+}
+
+size_t PyramidInfo::height() const
+{
+ return _tensor_shape.y();
+}
+
+const TensorShape &PyramidInfo::tensor_shape() const
+{
+ return _tensor_shape;
+}
+
+Format PyramidInfo::format() const
+{
+ return _format;
+}
+
+float PyramidInfo::scale() const
+{
+ return _scale;
+}
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
new file mode 100644
index 0000000000..f5a282df8a
--- /dev/null
+++ b/src/core/SubTensorInfo.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/SubTensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+SubTensorInfo::SubTensorInfo()
+ : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }
+{
+}
+
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+
+ // Initialize valid region
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void SubTensorInfo::set_tensor_shape(TensorShape shape)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+ _tensor_shape = shape;
+}
+
+bool SubTensorInfo::extend_padding(const PaddingSize &padding)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
+
+ // Extend parent padding if required
+ return _parent->extend_padding(padding);
+}
+
+size_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+ ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+ size_t offset = offset_first_element_in_bytes();
+ const Strides &strides = strides_in_bytes();
+
+ for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ {
+ offset += pos[i] * strides[i];
+ }
+
+ return offset;
+}
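A minimal construction sketch for the SubTensorInfo class added above; the 16x16x8 parent shape and the two-slice sub-tensor are illustrative values, and the Coordinates header path is assumed from the library's usual include layout.

// Sketch only: a sub-tensor keeps the parent's x/y extent and may only offset along
// the higher dimensions, as checked by ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR.
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/SubTensorInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

void sub_tensor_info_example()
{
    TensorInfo parent_info(TensorShape(16U, 16U, 8U), Format::F32);

    // Slices 4 and 5 of the third dimension; padding changes are forwarded to the parent.
    SubTensorInfo sub_info(&parent_info, TensorShape(16U, 16U, 2U), Coordinates(0, 0, 4));
    (void)sub_info;
}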
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
new file mode 100644
index 0000000000..3d07ccb69a
--- /dev/null
+++ b/src/core/TensorInfo.cpp
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+TensorInfo::TensorInfo()
+ : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
+ _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _fixed_point_position = info.fixed_point_position();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+}
+
+TensorInfo::TensorInfo(Format format)
+ : TensorInfo(TensorShape(), format)
+{
+}
+
+TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format)
+ : TensorInfo(TensorShape(width, height), format)
+{
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
+ : TensorInfo()
+{
+ init(tensor_shape, format);
+}
+
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position)
+ : TensorInfo()
+{
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+ : TensorInfo()
+{
+ init(tensor_shape, num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+ : TensorInfo()
+{
+ init(hog_info, width, height);
+}
+
+void TensorInfo::init(Format format)
+{
+ init(TensorShape(), format);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, Format format)
+{
+ size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+
+ init(tensor_shape, num_channels, type);
+
+ _format = format;
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, Format format,
+ const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
+{
+ size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+
+ init(tensor_shape, num_channels, type, strides_in_bytes, offset_first_element_in_bytes, total_size_in_bytes);
+
+ _format = format;
+}
+
+void TensorInfo::init(size_t num_channels, DataType data_type, size_t fixed_point_position)
+{
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+
+ set_tensor_shape(tensor_shape);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
+ const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+ _tensor_shape = tensor_shape;
+ _offset_first_element_in_bytes = offset_first_element_in_bytes;
+ _strides_in_bytes = strides_in_bytes;
+ _total_size = total_size_in_bytes;
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void TensorInfo::init(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+{
+ // Number of cells for each block
+ const Size2D num_cells_per_block = hog_info.num_cells_per_block();
+
+ // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks)
+ const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+
+ // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
+ const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
+
+ init(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+}
+
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
+{
+ const size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+ size_t total_size = init_auto_padding(tensor_shape, num_channels, type);
+
+ _format = format;
+
+ return total_size;
+}
+
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+ _tensor_shape = tensor_shape;
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+
+ auto_padding();
+
+ return _total_size;
+}
+
+size_t TensorInfo::init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+{
+ // Number of cells for each block
+ const Size2D num_cells_per_block = hog_info.num_cells_per_block();
+
+ // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks)
+ const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+
+ // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
+ const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
+
+ return init_auto_padding(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+}
+
+bool TensorInfo::auto_padding()
+{
+ ARM_COMPUTE_ERROR_ON(!_is_resizable);
+
+ // Some kernels compute 32 elements at a time; in the worst case they
+ // will read 32 values past the last element
+ const size_t extra_pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 32;
+ const size_t pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 4;
+ const size_t pad_y = _tensor_shape.num_dimensions() < 2 ? 0 : 4;
+
+ return extend_padding(PaddingSize(pad_y, pad_x + extra_pad_x, pad_y, pad_x));
+}
+
+std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(const PaddingSize &padding)
+{
+ // Calculate resulting stride for the X, Y and Z dimension
+ const size_t stride_x = element_size();
+ const size_t stride_y = (padding.left + _tensor_shape[0] + padding.right) * stride_x;
+ const size_t stride_z = (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
+
+ Strides required_strides;
+ size_t required_total_size = 0;
+ const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
+
+ switch(_tensor_shape.num_dimensions())
+ {
+ case 0:
+ {
+ if(_tensor_shape.total_size() > 0)
+ {
+ required_strides = Strides(stride_x);
+ required_total_size = stride_z;
+ }
+ break;
+ }
+ case 1:
+ required_strides = compute_strides(*this, stride_x);
+ required_total_size = stride_z;
+ break;
+ case 2:
+ required_strides = compute_strides(*this, stride_x, stride_y);
+ required_total_size = stride_z;
+ break;
+ default:
+ {
+ required_strides = compute_strides(*this, stride_x, stride_y, stride_z);
+
+ const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+
+ required_total_size = _tensor_shape[idx_last_dimension] * required_strides[idx_last_dimension];
+ break;
+ }
+ }
+
+ return std::make_tuple(required_strides, required_offset_first_element, required_total_size);
+}
+
+bool TensorInfo::extend_padding(const PaddingSize &padding)
+{
+ ARM_COMPUTE_ERROR_ON(!_is_resizable);
+
+ bool updated = false;
+
+ if(padding.top > _padding.top)
+ {
+ _padding.top = padding.top;
+ updated = true;
+ }
+
+ if(padding.right > _padding.right)
+ {
+ _padding.right = padding.right;
+ updated = true;
+ }
+
+ if(padding.bottom > _padding.bottom)
+ {
+ _padding.bottom = padding.bottom;
+ updated = true;
+ }
+
+ if(padding.left > _padding.left)
+ {
+ _padding.left = padding.left;
+ updated = true;
+ }
+
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+
+ return updated;
+}
+
+void TensorInfo::set_data_type(DataType data_type)
+{
+ _data_type = data_type;
+ _format = Format::UNKNOWN;
+}
+
+void TensorInfo::set_num_channels(int num_channels)
+{
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+}
+
+void TensorInfo::set_format(Format format)
+{
+ _format = format;
+
+ if(_data_type == DataType::UNKNOWN)
+ {
+ _num_channels = num_channels_from_format(format);
+ _data_type = data_type_from_format(format);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
+ ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
+ }
+}
+
+void TensorInfo::set_tensor_shape(TensorShape shape)
+{
+ _tensor_shape = shape;
+ _offset_first_element_in_bytes = 0;
+ _strides_in_bytes = compute_strides(*this);
+
+ if(_tensor_shape.num_dimensions() == 0)
+ {
+ _total_size = _strides_in_bytes[0];
+ }
+ else
+ {
+ const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+ _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+ }
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void TensorInfo::set_fixed_point_position(int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+ _fixed_point_position = fixed_point_position;
+}
+
+size_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+ ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+ size_t offset = _offset_first_element_in_bytes;
+
+ for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ {
+ offset += pos[i] * _strides_in_bytes[i];
+ }
+
+ return offset;
+}
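A short worked example of the element-offset computation implemented by TensorInfo::offset_element_in_bytes() above; the 4x3 U8 image is an illustrative value and the Coordinates header path is assumed from the library's include layout.

// Sketch only: offset = offset_first_element + sum(pos[i] * strides_in_bytes[i]).
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"

#include <cstddef>

using namespace arm_compute;

void tensor_info_offset_example()
{
    // 4x3 single-channel U8 image without padding: stride_x = 1 byte, stride_y = 4 bytes.
    TensorInfo info(4U, 3U, Format::U8);

    // offset = 0 + 2 * 1 + 1 * 4 = 6 bytes for the element at (x = 2, y = 1).
    const size_t offset = info.offset_element_in_bytes(Coordinates(2, 1));
    (void)offset;
}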
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
new file mode 100644
index 0000000000..bf005c12f6
--- /dev/null
+++ b/src/core/Utils.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+std::string arm_compute::build_information()
+{
+ static const std::string information =
+#include "arm_compute_version.embed"
+ ;
+ return information;
+}
+
+std::string arm_compute::read_file(const std::string &filename, bool binary)
+{
+ std::string out;
+ std::ifstream fs;
+
+ try
+ {
+ fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+ std::ios_base::openmode mode = std::ios::in;
+
+ if(binary)
+ {
+ mode |= std::ios::binary;
+ }
+
+ fs.open(filename, mode);
+
+ // Go to the end of the file
+ fs.seekg(0, std::ios::end);
+ // Reserve the memory required to store the file's content
+ out.reserve(fs.tellg());
+ // Go back to the beginning of the file
+ fs.seekg(0, std::ios::beg);
+ // Copy the content of the file
+ out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
+ }
+ catch(const std::ifstream::failure &e)
+ {
+ ARM_COMPUTE_ERROR("Accessing %s: %s", filename.c_str(), e.what());
+ }
+
+ return out;
+}
+
+const std::string &arm_compute::string_from_format(Format format)
+{
+ static std::map<Format, const std::string> formats_map =
+ {
+ { Format::UNKNOWN, "UNKNOWN" },
+ { Format::U8, "U8" },
+ { Format::S16, "S16" },
+ { Format::U16, "U16" },
+ { Format::S32, "S32" },
+ { Format::U32, "U32" },
+ { Format::F16, "F16" },
+ { Format::F32, "F32" },
+ { Format::UV88, "UV88" },
+ { Format::RGB888, "RGB888" },
+ { Format::RGBA8888, "RGBA8888" },
+ { Format::YUV444, "YUV444" },
+ { Format::YUYV422, "YUYV422" },
+ { Format::NV12, "NV12" },
+ { Format::NV21, "NV21" },
+ { Format::IYUV, "IYUV" },
+ { Format::UYVY422, "UYVY422" }
+ };
+
+ return formats_map[format];
+}
+
+const std::string &arm_compute::string_from_channel(Channel channel)
+{
+ static std::map<Channel, const std::string> channels_map =
+ {
+ { Channel::UNKNOWN, "UNKNOWN" },
+ { Channel::R, "R" },
+ { Channel::G, "G" },
+ { Channel::B, "B" },
+ { Channel::A, "A" },
+ { Channel::Y, "Y" },
+ { Channel::U, "U" },
+ { Channel::V, "V" },
+ { Channel::C0, "C0" },
+ { Channel::C1, "C1" },
+ { Channel::C2, "C2" },
+ { Channel::C3, "C3" }
+ };
+
+ return channels_map[channel];
+}
+
+const std::string &arm_compute::string_from_data_type(DataType dt)
+{
+ static std::map<DataType, const std::string> dt_map =
+ {
+ { DataType::UNKNOWN, "UNKNOWN" },
+ { DataType::S8, "S8" },
+ { DataType::U8, "U8" },
+ { DataType::QS8, "QS8" },
+ { DataType::S16, "S16" },
+ { DataType::U16, "U16" },
+ { DataType::QS16, "QS16" },
+ { DataType::S32, "S32" },
+ { DataType::U32, "U32" },
+ { DataType::S64, "S64" },
+ { DataType::U64, "U64" },
+ { DataType::F16, "F16" },
+ { DataType::F32, "F32" },
+ { DataType::F64, "F64" },
+ { DataType::SIZET, "SIZET" },
+ };
+
+ return dt_map[dt];
+}
+
+const std::string &arm_compute::string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
+{
+ static std::map<ActivationLayerInfo::ActivationFunction, const std::string> act_map =
+ {
+ { ActivationLayerInfo::ActivationFunction::ABS, "ABS" },
+ { ActivationLayerInfo::ActivationFunction::LINEAR, "LINEAR" },
+ { ActivationLayerInfo::ActivationFunction::LOGISTIC, "LOGISTIC" },
+ { ActivationLayerInfo::ActivationFunction::RELU, "RELU" },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, "BRELU" },
+ { ActivationLayerInfo::ActivationFunction::SOFT_RELU, "SRELU" },
+ { ActivationLayerInfo::ActivationFunction::SQRT, "SQRT" },
+ { ActivationLayerInfo::ActivationFunction::SQUARE, "SQUARE" },
+ { ActivationLayerInfo::ActivationFunction::TANH, "TANH" },
+ };
+
+ return act_map[act];
+}
+
+const std::string &arm_compute::string_from_matrix_pattern(MatrixPattern pattern)
+{
+ static std::map<MatrixPattern, const std::string> pattern_map =
+ {
+ { MatrixPattern::BOX, "BOX" },
+ { MatrixPattern::CROSS, "CROSS" },
+ { MatrixPattern::DISK, "DISK" },
+ { MatrixPattern::OTHER, "OTHER" },
+ };
+
+ return pattern_map[pattern];
+}
+
+const std::string &arm_compute::string_from_non_linear_filter_function(NonLinearFilterFunction function)
+{
+ static std::map<NonLinearFilterFunction, const std::string> func_map =
+ {
+ { NonLinearFilterFunction::MAX, "MAX" },
+ { NonLinearFilterFunction::MEDIAN, "MEDIAN" },
+ { NonLinearFilterFunction::MIN, "MIN" },
+ };
+
+ return func_map[function];
+}
+
+const std::string &arm_compute::string_from_interpolation_policy(InterpolationPolicy policy)
+{
+ static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
+ {
+ { InterpolationPolicy::AREA, "AREA" },
+ { InterpolationPolicy::BILINEAR, "BILINEAR" },
+ { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+ };
+
+ return interpolation_policy_map[policy];
+}
+
+const std::string &arm_compute::string_from_border_mode(BorderMode border_mode)
+{
+ static std::map<BorderMode, const std::string> border_mode_map =
+ {
+ { BorderMode::UNDEFINED, "UNDEFINED" },
+ { BorderMode::CONSTANT, "CONSTANT" },
+ { BorderMode::REPLICATE, "REPLICATE" },
+ };
+
+ return border_mode_map[border_mode];
+}
+
+const std::string &arm_compute::string_from_norm_type(NormType type)
+{
+ static std::map<NormType, const std::string> norm_type_map =
+ {
+ { NormType::IN_MAP_1D, "IN_MAP_1D" },
+ { NormType::IN_MAP_2D, "IN_MAP_2D" },
+ { NormType::CROSS_MAP, "CROSS_MAP" },
+ };
+
+ return norm_type_map[type];
+}
+
+std::string arm_compute::lower_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+ return res;
+}
+
+const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
+ unsigned int stride_x, unsigned int stride_y,
+ unsigned int pad_x, unsigned int pad_y,
+ DimensionRoundingType round_type)
+{
+ unsigned int w = 0;
+ unsigned int h = 0;
+ switch(round_type)
+ {
+ case DimensionRoundingType::FLOOR:
+ w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ break;
+ case DimensionRoundingType::CEIL:
+ w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding type");
+ }
+
+ // Make sure that border operations will start from inside the input and not the padded area
+ if(((w - 1) * stride_x) >= (width + pad_x))
+ {
+ --w;
+ }
+ if(((h - 1) * stride_y) >= (height + pad_y))
+ {
+ --h;
+ }
+ ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_x));
+ ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_y));
+
+ return std::make_pair(w, h);
+}
+
+void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::U16:
+ print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::S16:
+ print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::U32:
+ print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::S32:
+ print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::F32:
+ print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::F16:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ }
+}
+
+int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
+ case DataType::QS8:
+ case DataType::S8:
+ return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
+ case DataType::U16:
+ return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
+ case DataType::S16:
+ return max_consecutive_elements_display_width_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n);
+ case DataType::U32:
+ return max_consecutive_elements_display_width_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n);
+ case DataType::S32:
+ return max_consecutive_elements_display_width_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n);
+ case DataType::F32:
+ return max_consecutive_elements_display_width_impl<float>(s, reinterpret_cast<const float *>(ptr), n);
+ case DataType::F16:
+ return 0;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ }
+}
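A worked call of the scaled_dimensions() helper defined above; the 224x224 input, 3x3 kernel, stride 2 and padding 1 are illustrative values.

// Sketch only: with FLOOR rounding, w = floor((224 + 2*1 - 3) / 2 + 1) = 112 (same for h),
// and the border adjustment does not trigger because (112 - 1) * 2 < 224 + 1.
#include "arm_compute/core/Utils.h"

using namespace arm_compute;

void scaled_dimensions_example()
{
    const auto out_dims = scaled_dimensions(224 /* width */, 224 /* height */, 3 /* kernel_size */,
                                            2 /* stride_x */, 2 /* stride_y */,
                                            1 /* pad_x */, 1 /* pad_y */,
                                            DimensionRoundingType::FLOOR);
    // out_dims.first == 112, out_dims.second == 112
    (void)out_dims;
}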
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
new file mode 100644
index 0000000000..ae2841d7a4
--- /dev/null
+++ b/src/core/Validate.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Validate.h"
+
+void arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &win)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ full.validate();
+ win.validate();
+
+ for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != win[i].step(), function, file, line);
+ }
+}
+
+void arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &sub)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ full.validate();
+ sub.validate();
+
+ for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != sub[i].step(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC((sub[i].start() - full[i].start()) % sub[i].step(), function, file, line);
+ }
+}
+
+void arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+ const arm_compute::Coordinates &pos, unsigned int max_dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(pos);
+
+ for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(pos[i] != 0, function, file, line);
+ }
+}
+
+void arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+ const arm_compute::Window &win, unsigned int max_dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(win);
+
+ for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
+ function, file, line,
+ "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+ }
+}
+
+void arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+ const arm_compute::ITensor *tensor)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(tensor);
+
+ ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor->info()->num_dimensions() != 2,
+ function, file, line,
+ "Only 2D Tensors are supported by this kernel (%d passed)", tensor->info()->num_dimensions());
+}
+
+void arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+ arm_compute::Format fmt, arm_compute::Channel cn)
+{
+ ARM_COMPUTE_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
+
+ switch(fmt)
+ {
+ case arm_compute::Format::RGB888:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+ break;
+ case arm_compute::Format::RGBA8888:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+ break;
+ case arm_compute::Format::UV88:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+ break;
+ case arm_compute::Format::IYUV:
+ case arm_compute::Format::UYVY422:
+ case arm_compute::Format::YUYV422:
+ case arm_compute::Format::NV12:
+ case arm_compute::Format::NV21:
+ case arm_compute::Format::YUV444:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+ break;
+ default:
+ ARM_COMPUTE_ERROR_LOC(function, file, line, "Unsupported format.");
+ }
+}
+
+void arm_compute::error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+ const arm_compute::IMultiHOG *multi_hog)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ ARM_COMPUTE_ERROR_ON_LOC(nullptr == multi_hog, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(0 == multi_hog->num_models(), function, file, line);
+
+ for(size_t i = 1; i < multi_hog->num_models(); ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->phase_type() != multi_hog->model(i)->info()->phase_type(),
+ function, file, line,
+ "All HOG parameters must have the same phase type");
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->normalization_type() != multi_hog->model(i)->info()->normalization_type(),
+ function, file, line,
+ "All HOG parameters must have the same normalization type");
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((multi_hog->model(0)->info()->l2_hyst_threshold() != multi_hog->model(i)->info()->l2_hyst_threshold())
+ && (multi_hog->model(0)->info()->normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
+ function, file, line,
+ "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
+ }
+}
+
+void arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+ const arm_compute::IKernel *kernel)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(kernel);
+
+ ARM_COMPUTE_ERROR_ON_LOC(kernel == nullptr, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0),
+ function, file, line,
+ "This kernel hasn't been configured.");
+}
+
+void arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
+ const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_shape);
+ ARM_COMPUTE_UNUSED(coords);
+ ARM_COMPUTE_UNUSED(shape);
+
+ // Subtensor should not index in x, y dimensions.
+ ARM_COMPUTE_ERROR_ON_LOC(((coords.x() != 0) && (coords.y() != 0)), function, file, line);
+ // Subtensor shape should match parent tensor in x, y dimensions.
+ ARM_COMPUTE_ERROR_ON_LOC(((parent_shape.x() != shape.x()) && (parent_shape.y() != shape.y())), function, file, line);
+
+ // Check dimensions
+ for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
+ function, file, line);
+ }
+}
+
+void arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+ const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_valid_region);
+ ARM_COMPUTE_UNUSED(valid_region);
+
+ // Check valid regions
+ for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
+ }
+}
diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
new file mode 100644
index 0000000000..f1dd95e77e
--- /dev/null
+++ b/src/runtime/CL/CLDistribution1D.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDistribution1D::CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : ICLDistribution1D(num_bins, offset, range), _mem(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, num_bins * sizeof(int32_t))
+{
+}
+
+void CLDistribution1D::map(bool blocking)
+{
+ ICLDistribution1D::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLDistribution1D::unmap()
+{
+ ICLDistribution1D::unmap(CLScheduler::get().queue());
+}
+
+uint32_t *CLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ return static_cast<uint32_t *>(q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLDistribution1D::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ q.enqueueUnmapMemObject(_mem, _mapping);
+}
+
+cl::Buffer &CLDistribution1D::cl_buffer()
+{
+ return _mem;
+}
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000000..3f5266ce70
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOG::CLHOG()
+ : _info(), _buffer()
+{
+}
+
+void CLHOG::init(const HOGInfo &input)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ _info = input;
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+void CLHOG::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+ _buffer = cl::Buffer();
+}
+
+const HOGInfo *CLHOG::info() const
+{
+ return &_info;
+}
+
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+ return _buffer;
+}
+
+void CLHOG::map(bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+ ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLHOG::unmap()
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+ ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+}
+
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, descriptor());
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
new file mode 100644
index 0000000000..a8cbf2131f
--- /dev/null
+++ b/src/runtime/CL/CLLut.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLut.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+CLLut::CLLut()
+ : _allocator()
+{
+}
+
+CLLut::CLLut(size_t num_elements, DataType data_type)
+ : _allocator()
+{
+ _allocator.init(num_elements, data_type);
+}
+
+size_t CLLut::num_elements() const
+{
+ return _allocator.num_elements();
+}
+
+uint32_t CLLut::index_offset() const
+{
+ return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
+}
+
+size_t CLLut::size_in_bytes() const
+{
+ return _allocator.size();
+}
+
+DataType CLLut::type() const
+{
+ return _allocator.type();
+}
+
+const cl::Buffer &CLLut::cl_buffer() const
+{
+ return _allocator.cl_data();
+}
+
+void CLLut::clear()
+{
+ cl::CommandQueue &q = CLScheduler::get().queue();
+ uint8_t *data = _allocator.map(q, true /* blocking */);
+ std::memset(data, 0, size_in_bytes());
+ _allocator.unmap(q, data);
+}
+
+ILutAllocator *CLLut::allocator()
+{
+ return &_allocator;
+}
+
+void CLLut::map(bool blocking)
+{
+ ICLLut::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLLut::unmap()
+{
+ ICLLut::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLLut::do_map(cl::CommandQueue &q, bool blocking)
+{
+ return _allocator.map(q, blocking);
+}
+
+void CLLut::do_unmap(cl::CommandQueue &q)
+{
+ _allocator.unmap(q, buffer());
+}
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
new file mode 100644
index 0000000000..311de4bb8d
--- /dev/null
+++ b/src/runtime/CL/CLLutAllocator.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLutAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLLutAllocator::CLLutAllocator()
+ : _buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *CLLutAllocator::data()
+{
+ return _mapping;
+}
+
+const cl::Buffer &CLLutAllocator::cl_data() const
+{
+ return _buffer;
+}
+
+uint8_t *CLLutAllocator::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLLutAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, mapping);
+}
+
+void CLLutAllocator::allocate()
+{
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size());
+}
+
+uint8_t *CLLutAllocator::lock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ cl::CommandQueue q = CLScheduler::get().queue();
+ _mapping = map(q, true);
+ return _mapping;
+}
+
+void CLLutAllocator::unlock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ cl::CommandQueue q = CLScheduler::get().queue();
+ unmap(q, _mapping);
+ _mapping = nullptr;
+}
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000000..b9e8739454
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+CLMultiHOG::CLMultiHOG(size_t num_models)
+ : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+size_t CLMultiHOG::num_models() const
+{
+ return _num_models;
+}
+
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
+
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
new file mode 100644
index 0000000000..63059cb5f4
--- /dev/null
+++ b/src/runtime/CL/CLMultiImage.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiImage.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLMultiImage::CLMultiImage()
+ : _info(), _plane()
+{
+}
+
+const MultiImageInfo *CLMultiImage::info() const
+{
+ return &_info;
+}
+
+void CLMultiImage::init(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, false);
+}
+
+void CLMultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, true);
+}
+
+void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
+{
+ TensorInfo info(width, height, Format::U8);
+
+ if(auto_padding)
+ {
+ info.auto_padding();
+ }
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ TensorInfo info_full(width, height, format);
+
+ if(auto_padding)
+ {
+ info_full.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+
+ if(auto_padding)
+ {
+ info_uv88.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+
+ if(auto_padding)
+ {
+ info_sub2.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_sub2);
+ std::get<2>(_plane).allocator()->init(info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info);
+ std::get<2>(_plane).allocator()->init(info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
+
+void CLMultiImage::allocate()
+{
+ switch(_info.format())
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ std::get<0>(_plane).allocator()->allocate();
+ break;
+ case Format::NV12:
+ case Format::NV21:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ std::get<2>(_plane).allocator()->allocate();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+CLImage *CLMultiImage::cl_plane(unsigned int index)
+{
+ return &_plane[index];
+}
+
+const CLImage *CLMultiImage::cl_plane(unsigned int index) const
+{
+ return &_plane[index];
+}
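A minimal sketch of the plane geometry produced by CLMultiImage::internal_init() above for an NV12 image, assuming an OpenCL context and queue have already been set up on CLScheduler; the 640x480 size is an illustrative value.

// Sketch only: NV12 allocates a full-resolution U8 luma plane and a half-resolution
// UV88 interleaved chroma plane.
#include "arm_compute/runtime/CL/CLMultiImage.h"

using namespace arm_compute;

void multi_image_nv12_example()
{
    CLMultiImage image;
    image.init(640U, 480U, Format::NV12);
    image.allocate();

    CLImage *luma   = image.cl_plane(0); // 640x480, Format::U8
    CLImage *chroma = image.cl_plane(1); // 320x240, Format::UV88
    (void)luma;
    (void)chroma;
}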
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
new file mode 100644
index 0000000000..41d81ea0f8
--- /dev/null
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <array>
+#include <cmath>
+
+using namespace arm_compute;
+
+CLPyramid::CLPyramid()
+ : _info(), _pyramid(nullptr)
+{
+}
+
+void CLPyramid::init(const PyramidInfo &info)
+{
+ internal_init(info, false);
+}
+
+void CLPyramid::init_auto_padding(const PyramidInfo &info)
+{
+ internal_init(info, true);
+}
+
+void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
+{
+ _info = info;
+ _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+
+ size_t w = _info.width();
+ size_t h = _info.height();
+ size_t ref_w = w;
+ size_t ref_h = h;
+ const bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
+ TensorShape tensor_shape = _info.tensor_shape();
+
+ // Note: Look-up table used by the OpenVX sample implementation
+ const std::array<float, 4> c_orbscale =
+ {
+ {
+ 0.5f,
+ SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+ }
+ };
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ TensorInfo tensor_info(tensor_shape, _info.format());
+
+ if(auto_padding)
+ {
+ tensor_info.auto_padding();
+ }
+
+ _pyramid[i].allocator()->init(tensor_info);
+
+ if(is_orb_scale)
+ {
+ const float orb_scale = c_orbscale[(i + 1) % 4];
+ w = std::ceil(ref_w * orb_scale);
+ h = std::ceil(ref_h * orb_scale);
+
+ if(0 == ((i + 1) % 4))
+ {
+ ref_w = w;
+ ref_h = h;
+ }
+ }
+ else
+ {
+ w = (w + 1) * _info.scale();
+ h = (h + 1) * _info.scale();
+ }
+
+ // Update tensor_shape
+ tensor_shape.set(0, w);
+ tensor_shape.set(1, h);
+ }
+}
+
+void CLPyramid::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ (_pyramid.get() + i)->allocator()->allocate();
+ }
+}
+
+const PyramidInfo *CLPyramid::info() const
+{
+ return &_info;
+}
+
+CLTensor *CLPyramid::get_pyramid_level(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
+
+ return (_pyramid.get() + index);
+}
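For reference, a small standalone sketch mirroring the per-level size computation performed by CLPyramid::internal_init() above for a non-ORB scale; the 640x480 base size, 0.5 scale and three levels are illustrative values.

// Sketch only: each next level is (w + 1) * scale wide and (h + 1) * scale high,
// truncated to size_t exactly as in the patch.
#include <cstddef>

void pyramid_level_sizes_example()
{
    std::size_t w = 640, h = 480; // level 0
    const float scale = 0.5f;

    // Levels come out as 640x480, 320x240 and 160x120.
    for(std::size_t level = 1; level < 3; ++level)
    {
        w = (w + 1) * scale; // implicit truncation to size_t, as in the patch
        h = (h + 1) * scale;
    }
    (void)w;
    (void)h;
}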
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
new file mode 100644
index 0000000000..fe25ce534c
--- /dev/null
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+using namespace arm_compute;
+
+CLScheduler::CLScheduler()
+ : _context(), _queue(), _target(GPUTarget::MIDGARD)
+{
+}
+
+CLScheduler &CLScheduler::get()
+{
+ static CLScheduler scheduler;
+ return scheduler;
+}
+
+void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+{
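+ // Run the kernel over its configured execution window on the scheduler's command queue;
+ // an optional flush submits the queued commands to the device straight away.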
+ kernel.run(kernel.window(), _queue);
+
+ if(flush)
+ {
+ _queue.flush();
+ }
+}
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
new file mode 100644
index 0000000000..b228c0abda
--- /dev/null
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *CLSubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *CLSubTensor::info()
+{
+ return &_info;
+}
+
+const cl::Buffer &CLSubTensor::cl_buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->cl_buffer();
+}
+
+ICLTensor *CLSubTensor::parent()
+{
+ return _parent;
+}
+
+void CLSubTensor::map(bool blocking)
+{
+ ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLSubTensor::unmap()
+{
+ ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
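+ // Map the parent's buffer for read/write access; a blocking map waits until the mapped region is available on the host.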
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+}
+
+void CLSubTensor::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+ q.enqueueUnmapMemObject(cl_buffer(), buffer());
+}
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
new file mode 100644
index 0000000000..eefa0331d5
--- /dev/null
+++ b/src/runtime/CL/CLTensor.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLTensor::CLTensor()
+ : _allocator()
+{
+}
+
+TensorInfo *CLTensor::info() const
+{
+ return &_allocator.info();
+}
+
+TensorInfo *CLTensor::info()
+{
+ return &_allocator.info();
+}
+
+const cl::Buffer &CLTensor::cl_buffer() const
+{
+ return _allocator.cl_data();
+}
+
+ITensorAllocator *CLTensor::allocator()
+{
+ return &_allocator;
+}
+
+void CLTensor::map(bool blocking)
+{
+ ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLTensor::unmap()
+{
+ ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+ return _allocator.map(q, blocking);
+}
+
+void CLTensor::do_unmap(cl::CommandQueue &q)
+{
+ _allocator.unmap(q, buffer());
+}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
new file mode 100644
index 0000000000..8112a7148f
--- /dev/null
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLTensorAllocator::CLTensorAllocator()
+ : _buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *CLTensorAllocator::data()
+{
+ return _mapping;
+}
+
+const cl::Buffer &CLTensorAllocator::cl_data() const
+{
+ return _buffer;
+}
+
+void CLTensorAllocator::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+
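+ // CL_MEM_ALLOC_HOST_PTR asks the OpenCL runtime for host-accessible memory so that map()/unmap()
+ // can avoid extra copies on devices that share memory with the host.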
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ info().set_is_resizable(false);
+}
+
+void CLTensorAllocator::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+ _buffer = cl::Buffer();
+ info().set_is_resizable(true);
+}
+
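+ // lock()/unlock() perform a blocking map/unmap on the global scheduler queue and cache the host pointer in _mapping.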
+uint8_t *CLTensorAllocator::lock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ _mapping = map(CLScheduler::get().queue(), true);
+ return _mapping;
+}
+
+void CLTensorAllocator::unlock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ unmap(CLScheduler::get().queue(), _mapping);
+ _mapping = nullptr;
+}
+
+uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+}
+
+void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, mapping);
+}
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
new file mode 100644
index 0000000000..aa45743d37
--- /dev/null
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+ICLSimpleFunction::ICLSimpleFunction()
+ : _kernel(), _border_handler()
+{
+}
+
+void ICLSimpleFunction::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or the function isn't configured");
+
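+ // Fill the borders first without flushing, then enqueue the actual kernel.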
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(*_kernel);
+}
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
new file mode 100644
index 0000000000..5097dd4710
--- /dev/null
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
+
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
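+ // Simple functions only create and configure their kernel; ICLSimpleFunction::run() takes care of enqueuing it.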
+ auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
new file mode 100644
index 0000000000..56c519984c
--- /dev/null
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
+
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+ k->configure(input, accum);
+ _kernel = std::move(k);
+}
+
+void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+ k->configure(input, alpha, accum);
+ _kernel = std::move(k);
+}
+
+void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+ k->configure(input, shift, accum);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
new file mode 100644
index 0000000000..9b5bd8b663
--- /dev/null
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+ k->configure(input, output, act_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
new file mode 100644
index 0000000000..36bff4285c
--- /dev/null
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
+
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
new file mode 100644
index 0000000000..97f0a1caf4
--- /dev/null
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
new file mode 100644
index 0000000000..3df673c6a6
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayer::CLBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+{
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void CLBatchNormalizationLayer::run()
+{
+ CLScheduler::get().enqueue(_norm_kernel, true);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
new file mode 100644
index 0000000000..7c85043206
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
new file mode 100644
index 0000000000..17ae5dea3c
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
new file mode 100644
index 0000000000..c84a279bae
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
new file mode 100644
index 0000000000..fd49c7d818
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
new file mode 100644
index 0000000000..8de6807c73
--- /dev/null
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
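+ // With BorderMode::UNDEFINED the kernel skips the border pixels; otherwise the border handler
+ // fills a 1-pixel border around the input before the 3x3 kernel runs.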
+ auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
new file mode 100644
index 0000000000..1d018b8347
--- /dev/null
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+
+using namespace arm_compute;
+
+CLCannyEdge::CLCannyEdge()
+ : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+{
+}
+
+void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+ ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
+
+ const unsigned int L1_hysteresis_stack_size = 8;
+ const TensorShape shape = input->info()->tensor_shape();
+
+ TensorInfo gradient_info;
+ TensorInfo info;
+
+ // Initialize images
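+ // Intermediate gradient and magnitude types widen with the Sobel size: S16/U16 up to 5x5, S32/U32 for 7x7.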
+ if(gradient_size < 7)
+ {
+ gradient_info.init(shape, 1, arm_compute::DataType::S16);
+ info.init(shape, 1, arm_compute::DataType::U16);
+ }
+ else
+ {
+ gradient_info.init(shape, 1, arm_compute::DataType::S32);
+ info.init(shape, 1, arm_compute::DataType::U32);
+ }
+
+ _gx.allocator()->init(gradient_info);
+ _gy.allocator()->init(gradient_info);
+ _mag.allocator()->init(info);
+ _nonmax.allocator()->init(info);
+
+ TensorInfo info_u8(shape, 1, arm_compute::DataType::U8);
+ _phase.allocator()->init(info_u8);
+ _l1_list_counter.allocator()->init(info_u8);
+
+ TensorInfo info_u32(shape, 1, arm_compute::DataType::U32);
+ _visited.allocator()->init(info_u32);
+ _recorded.allocator()->init(info_u32);
+
+ TensorShape shape_l1_stack = input->info()->tensor_shape();
+ shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size);
+ TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
+ _l1_stack.allocator()->init(info_s32);
+
+ // Configure/Init sobelNxN
+ if(gradient_size == 3)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 5)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 7)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
+ }
+
+ // Configure gradient
+ _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+
+ // Configure non-maxima suppression
+ _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+
+ // Fill the border around the magnitude image as non-maxima suppression will access it.
+ // If the border mode is undefined, filling the border is a no-op.
+ _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+
+ // Configure edge tracing
+ _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
+
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _mag.allocator()->allocate();
+ _visited.allocator()->allocate();
+ _recorded.allocator()->allocate();
+ _l1_stack.allocator()->allocate();
+ _l1_list_counter.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void CLCannyEdge::run()
+{
+ // Run sobel
+ _sobel->run();
+
+ // Run phase and magnitude calculation
+ CLScheduler::get().enqueue(_gradient, false);
+
+ // Fill the border before non-maxima suppression. No-op when the border mode is undefined.
+ CLScheduler::get().enqueue(_border_mag_gradient, false);
+
+ // Run non-maxima suppression
+ _nonmax.clear(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_non_max_suppr, false);
+
+ // Clear temporary structures and run edge trace
+ _visited.clear(CLScheduler::get().queue());
+ _recorded.clear(CLScheduler::get().queue());
+ _l1_list_counter.clear(CLScheduler::get().queue());
+ _l1_stack.clear(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_edge_trace, true);
+}
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
new file mode 100644
index 0000000000..79a3676bd7
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, plane3, output);
+ _kernel = std::move(k);
+}
+
+void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
new file mode 100644
index 0000000000..2c6174b9ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
+
+void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
new file mode 100644
index 0000000000..2fe465aeb8
--- /dev/null
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
+
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
new file mode 100644
index 0000000000..21b5d47679
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+ k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+template <unsigned int matrix_size>
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
+ : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+{
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+ int16_t conv_col[matrix_size];
+ int16_t conv_row[matrix_size];
+ _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+
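+ // If the matrix factors into a column and a row vector, run the convolution as two 1D passes through an intermediate tensor.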
+ if(_is_separable)
+ {
+ std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
+
+ _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+
+ // Allocate intermediate buffer
+ _tmp.allocator()->allocate();
+ }
+ else
+ {
+ _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::run()
+{
+ CLScheduler::get().enqueue(_border_handler);
+
+ if(_is_separable)
+ {
+ CLScheduler::get().enqueue(_kernel_hor, false);
+ CLScheduler::get().enqueue(_kernel_vert);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_kernel);
+ }
+}
+
+template class arm_compute::CLConvolutionSquare<5>;
+template class arm_compute::CLConvolutionSquare<7>;
+template class arm_compute::CLConvolutionSquare<9>;
+
+void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+ k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
new file mode 100644
index 0000000000..f0bbc3514f
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
+
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const bool _has_bias = (biases != nullptr);
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ const DataType dt = weights->info()->data_type();
+ TensorInfo info_wr(shape_wr, 1, dt);
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void CLConvolutionLayerReshapeWeights::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ if(_transpose1xW)
+ {
+ CLScheduler::get().enqueue(_weights_transposed_kernel);
+ }
+}
+
+CLConvolutionLayer::CLConvolutionLayer()
+ : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Check if its a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ // Create tensor to store the reshaped weights
+ size_t mat_weights_cols = weights->info()->dimension(3);
+ size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
+ weights = &_weights_reshaped;
+ }
+ else
+ {
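+ // Transpose1xW layout: the reshaped weight matrix is transposed and stored in blocks of 4 values,
+ // which gives the (rows * 4, ceil(cols / 4)) shape computed below.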
+ // Create tensor to store transposed weights
+ TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
+ TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
+ _weights_transposed.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_transposed, true);
+ weights = &_weights_transposed;
+ }
+ }
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create interleaved tensor to prepare the input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
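+ // Interleave4x4 packs values from 4 consecutive rows next to each other so the matrix multiply
+ // kernel can read them with vector loads: the width becomes x * 4 and the height ceil(y / 4).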
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
+
+ if(!_are_weights_reshaped)
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ _weights_transposed.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ }
+
+ _input_im2col_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
+}
+
+void CLConvolutionLayer::run()
+{
+ // Run weights reshaping (runs only once per configure() call)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
+ }
+
+ // Run input reshaping
+ CLScheduler::get().enqueue(_input_im2col_kernel);
+ if(!_is_fully_connected_convolution)
+ {
+ CLScheduler::get().enqueue(_input_interleave_kernel);
+ }
+
+ // Run matrix multiply on the reshaped matrices
+ CLScheduler::get().enqueue(_mm_kernel);
+
+ // Reshape output matrix
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
new file mode 100644
index 0000000000..d967d9865f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
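+// Minimal usage sketch (hypothetical tensor names; all tensors must be configured and allocated
+// before run() is called):
+//   CLDepthConcatenate concat;
+//   concat.configure({ &in0, &in1, &in2 }, &out);
+//   concat.run();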
+CLDepthConcatenate::CLDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+
+ unsigned int depth_offset = 0;
+
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void CLDepthConcatenate::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+}
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
new file mode 100644
index 0000000000..edcd4928ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+ k->configure(input, output, policy, shift);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
new file mode 100644
index 0000000000..c51cb4c333
--- /dev/null
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
new file mode 100644
index 0000000000..345f47763c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDilate.h"
+
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
new file mode 100644
index 0000000000..3b182d31b6
--- /dev/null
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
+
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <numeric>
+
+using namespace arm_compute;
+
+namespace
+{
+void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
+{
+ dist.map(true);
+ cum_dist.map(true);
+ lut.map(true);
+
+ const uint32_t *dist_ptr = dist.buffer();
+ uint32_t *cum_dist_ptr = cum_dist.buffer();
+ uint8_t *lut_ptr = lut.buffer();
+
+ ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
+ ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
+ ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
+
+ // Calculate cumulative distribution
+ std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
+
+ // Get the number of pixels that have the lowest value in the input image
+ const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
+ {
+ return v > 0;
+ });
+ const size_t image_size = cum_dist_ptr[255];
+
+ if(image_size == num_lowest_pixels)
+ {
+ std::iota(lut_ptr, lut_ptr + 256, 0);
+ }
+ else
+ {
+ const float diff = image_size - num_lowest_pixels;
+
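+ // Standard histogram equalization: lut[i] = round((cdf[i] - cdf_min) / (N - cdf_min) * 255),
+ // where cdf_min is the count of the lowest occurring value and N is the total number of pixels.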
+ for(size_t i = 0; i < 256; ++i)
+ {
+ lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
+ }
+ }
+
+ dist.unmap();
+ cum_dist.unmap();
+ lut.unmap();
+}
+} // namespace
+
+CLEqualizeHistogram::CLEqualizeHistogram()
+ : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+{
+}
+
+void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
+{
+ _histogram_kernel.configure(input, &_hist);
+ _border_histogram_kernel.configure(input, &_hist);
+ _map_histogram_kernel.configure(input, &_cd_lut, output);
+}
+
+void CLEqualizeHistogram::run()
+{
+ // Calculate histogram of input.
+ CLScheduler::get().enqueue(_histogram_kernel, false);
+
+ // Handle the remaining pixels when the image size is not a multiple of the number of elements processed by the histogram kernel
+ CLScheduler::get().enqueue(_border_histogram_kernel, false);
+
+ // Calculate cumulative distribution of histogram and create LUT.
+ calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
+
+ // Map input to output using created LUT.
+ CLScheduler::get().enqueue(_map_histogram_kernel);
+}
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
new file mode 100644
index 0000000000..b4c50e465a
--- /dev/null
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLErode.h"
+
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
new file mode 100644
index 0000000000..d2903fb849
--- /dev/null
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <algorithm>
+#include <cstring>
+
+using namespace arm_compute;
+
+CLFastCorners::CLFastCorners()
+ : _fast_corners_kernel(),
+ _suppr_func(),
+ _copy_array_kernel(),
+ _output(),
+ _suppr(),
+ _win(),
+ _non_max(false),
+ _num_corners(nullptr),
+ _num_buffer(),
+ _corners(nullptr),
+ _constant_border_value(0)
+{
+}
+
+void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *const corners,
+ unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+ ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
+ _output.allocator()->init(tensor_info);
+
+ _non_max = nonmax_suppression;
+ _num_corners = num_corners;
+ _corners = corners;
+ _num_buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+ _constant_border_value = constant_border_value;
+
+ const bool update_number = (nullptr != _num_corners);
+
+ _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
+
+ if(!_non_max)
+ {
+ _copy_array_kernel.configure(&_output, update_number, corners, &_num_buffer);
+ }
+ else
+ {
+ _suppr.allocator()->init(tensor_info);
+
+ _suppr_func.configure(&_output, &_suppr, border_mode);
+ _copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
+
+ _suppr.allocator()->allocate();
+ }
+
+ // Allocate intermediate tensors
+ _output.allocator()->allocate();
+}
+
+void CLFastCorners::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ if(_non_max)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
+ const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
+ memset(out_buffer, 0, _output.info()->total_size());
+ q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
+ }
+
+ CLScheduler::get().enqueue(_fast_corners_kernel, false);
+
+ if(_non_max)
+ {
+ _suppr_func.run();
+ }
+
+ CLScheduler::get().enqueue(_copy_array_kernel, false);
+
+ unsigned int get_num_corners = 0;
+ q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
+
+ size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
+
+ _corners->resize(corner_size);
+
+ if(_num_corners != nullptr)
+ {
+ *_num_corners = get_num_corners;
+ }
+
+ q.flush();
+}
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
new file mode 100644
index 0000000000..9e59b771d8
--- /dev/null
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
+ k->configure(tensor, border_width, border_mode, constant_border_value);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
new file mode 100644
index 0000000000..57d57d517f
--- /dev/null
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void CLFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
+ }
+ if(_is_batched_fc_layer)
+ {
+ CLScheduler::get().enqueue(_transpose1xW_kernel);
+ }
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer()
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
+{
+}
+
+void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, input->info()->dimension(3));
+ shape_im2col.set(2, input->info()->dimension(4));
+ shape_im2col.set(3, input->info()->dimension(5));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = input->info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, 1);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
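+ // Which case applies is determined below: batches from the output's second dimension, and
+ // "after convolution" from comparing the input and weights shapes.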
+
+ // Check if we have a fully connected layer with batches
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+
+ const ICLTensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
+ {
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
+
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
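+ // Batched case: the weights are transposed and then stored in transpose1xW format, packing
+ // 16 bytes / element size values per block, hence the shape computed below.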
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer with batches
+ configure_conv_fc_wb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer with batches
+ configure_fc_fc_wb(input, weights_to_use, output);
+ }
+ }
+ else
+ {
+ // In the non-batched fully connected layer case, the weights are not reshaped using transpose1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc_nb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc_nb(input, weights_to_use, output);
+ }
+ }
+
+ // Allocate the reshaped weights tensor only if are_weights_reshaped is false, once all the configure methods have been called
+ if(!are_weights_reshaped)
+ {
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+ }
+}
+
+void CLFullyConnectedLayer::run()
+{
+ // Reshape the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
+ }
+
+ // Linearize input if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ CLScheduler::get().enqueue(_im2col_kernel, false);
+ }
+
+ // Interleave input
+ if(_is_batched_fc_layer)
+ {
+ CLScheduler::get().enqueue(_interleave4x4_kernel, false);
+ }
+
+ // Run matrix multiply
+ CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+
+ // Accumulate biases if provided
+ if(_accumulate_biases)
+ {
+ CLScheduler::get().enqueue(_accumulate_biases_kernel);
+ }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
new file mode 100644
index 0000000000..7408054127
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
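+// Minimal usage sketch (hypothetical tensor names; a, b and dst must be initialised and allocated
+// with a supported data type before configure() is called):
+//   CLGEMM gemm;
+//   gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // dst = 1.0f * (a * b), no C matrix added
+//   gemm.run();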
+CLGEMM::CLGEMM()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+{
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+ if(a->info()->dimension(1) != 1)
+ {
+ _run_vector_matrix_multiplication = false;
+
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
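+ // Matrix A is interleaved 4x4 and matrix B is transposed 1xW, where W = 16 bytes / element size
+ // (4 elements for F32, 8 for F16), so the matrix multiply kernel can use wide vector loads.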
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ if(DataType::F32 == a->info()->data_type())
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 4);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
+ }
+ else if(DataType::F16 == a->info()->data_type())
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 8);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("DataType not supported");
+ }
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+ else // The first input tensor is a vector
+ {
+ _run_vector_matrix_multiplication = true;
+
+ // Configure the matrix multiply kernel
+ _mm_kernel.configure(a, b, output, alpha);
+ }
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, output, beta);
+ _run_addition = true;
+ }
+}
+
+void CLGEMM::run()
+{
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_interleave_kernel, false);
+
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+ }
+
+ // Run matrix multiply kernel
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ CLScheduler::get().enqueue(_ma_kernel);
+ }
+}
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
new file mode 100644
index 0000000000..9dc77156ef
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
new file mode 100644
index 0000000000..45e011d8ce
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLGEMMLowp::CLGEMMLowp()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+{
+}
+
+void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
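+ // As in CLGEMM, matrix A is interleaved 4x4; for U8 data the transpose1xW kernel packs
+ // 16 elements per block (16 bytes / 1 byte per element), matching the shapes computed below.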
+ // Create shape for interleaved temporary tensor
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+
+ // Create shape for transposed temporary tensor
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16));
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure kernels
+ _interleave_kernel.configure(a, &_tmp_a);
+ _transpose_kernel.configure(b, &_tmp_b);
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
+
+ // Allocate intermediate buffers
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+}
+
+void CLGEMMLowp::run()
+{
+ /* Run interleave kernel */
+ CLScheduler::get().enqueue(_interleave_kernel, false);
+
+ /* Run transpose kernel */
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+
+ /* Run matrix multiply kernel */
+ CLScheduler::get().enqueue(_mm_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
new file mode 100644
index 0000000000..362a3fe920
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
new file mode 100644
index 0000000000..e83a8fb857
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+CLGaussian5x5::CLGaussian5x5()
+ : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+{
+}
+
+void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
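+ // The 5x5 Gaussian is separable: a horizontal pass writes into a U16 intermediate tensor and a
+ // vertical pass produces the final U8 output.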
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
+
+ _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+
+ // Allocate intermediate buffers
+ _tmp.allocator()->allocate();
+}
+
+void CLGaussian5x5::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_kernel_hor, false);
+ CLScheduler::get().enqueue(_kernel_vert);
+}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
new file mode 100644
index 0000000000..8a4279e99b
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLGaussianPyramid::CLGaussianPyramid()
+ : _input(nullptr), _pyramid(nullptr), _tmp()
+{
+}
+
+CLGaussianPyramidHalf::CLGaussianPyramidHalf()
+ : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+{
+}
+
+void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _border_handler = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+
+ // Apply half scale to the X dimension of the tensor shape
+ TensorShape tensor_shape = pyramid->info()->tensor_shape();
+ tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
+
+ _tmp.init(pyramid_info);
+
+ for(size_t i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure horizontal kernel */
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure vertical kernel */
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure border */
+ _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ }
+ _tmp.allocate();
+ }
+}
+
+void CLGaussianPyramidHalf::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
+ _input->map(CLScheduler::get().queue(), true /* blocking */);
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+ _input->unmap(CLScheduler::get().queue());
+ _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handler[i], false);
+ CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+ CLScheduler::get().enqueue(_vertical_reduction[i], false);
+ }
+}
+
+CLGaussianPyramidOrb::CLGaussianPyramidOrb()
+ : _gauss5x5(), _scale_nearest()
+{
+}
+
+void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _gauss5x5 = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+
+ _tmp.init(pyramid_info);
+
+ for(size_t i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure gaussian 5x5 */
+ _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+
+ /* Configure scale image kernel */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED);
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void CLGaussianPyramidOrb::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
+ _input->map(CLScheduler::get().queue(), true /* blocking */);
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+ _input->unmap(CLScheduler::get().queue());
+ _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _gauss5x5[i].run();
+ CLScheduler::get().enqueue(_scale_nearest[i]);
+ }
+}
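
CLGaussianPyramidHalf above applies a horizontal/vertical reduction per level and copies the input into level 0 at run time. A sketch of building a half-scale pyramid, with an illustrative 4-level 640x480 setup and the CL context assumed to be initialised already:

    #include "arm_compute/core/PyramidInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLPyramid.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"

    using namespace arm_compute;

    void gaussian_pyramid_half_example()
    {
        CLTensor src;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        // Level 0 must match the input size; every further level is half the previous one
        PyramidInfo pyramid_info(4, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8);
        CLPyramid   pyramid;
        pyramid.init(pyramid_info);

        CLGaussianPyramidHalf gauss_pyr;
        gauss_pyr.configure(&src, &pyramid, BorderMode::REPLICATE, 0);

        src.allocator()->allocate();
        pyramid.allocate();

        // ... fill src ..., then compute all pyramid levels
        gauss_pyr.run();
    }
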
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
new file mode 100644
index 0000000000..b1b5a03ac1
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGDescriptor::CLHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Initialize tensors for magnitude, phase and hog space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialise gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialise orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialize HOG norm kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
+
+void CLHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning
+ CLScheduler::get().enqueue(_orient_bin, false);
+
+ // Run block normalization
+ CLScheduler::get().enqueue(_block_norm);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
new file mode 100644
index 0000000000..8eb5e4251f
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLHOGDetector::CLHOGDetector()
+ : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+{
+}
+
+void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ _detection_windows = detection_windows;
+
+ // Allocate buffer for storing the number of detected objects
+ _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+
+ // Configure HOGDetectorKernel
+ _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset number of detections
+ const unsigned int init_num_detection_windows = _detection_windows->num_values();
+ q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
+
+ // Run CLHOGDetectorKernel
+ CLScheduler::get().enqueue(_hog_detector_kernel);
+
+ // Read number of detections
+ unsigned int num_detection_windows = 0;
+ q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
+
+ // Update the number of values stored in _detection_windows
+ _detection_windows->resize(static_cast<size_t>(num_detection_windows));
+
+ q.flush();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
new file mode 100644
index 0000000000..2387474358
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGGradient::CLHOGGradient()
+ : _derivative(), _mag_phase(), _gx(), _gy()
+{
+}
+
+void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+ // Initialise derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialise magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ }
+ else
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ }
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void CLHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ CLScheduler::get().enqueue(_mag_phase);
+}
\ No newline at end of file
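
A small sketch of the gradient function above: the magnitude output must be S16 and the phase U8, matching the checks in configure(); sizes below are illustrative and the CL context is assumed to be set up already:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"

    using namespace arm_compute;

    void hog_gradient_example()
    {
        const TensorShape shape(640U, 480U);

        CLTensor src, magnitude, phase;
        src.allocator()->init(TensorInfo(shape, Format::U8));
        magnitude.allocator()->init(TensorInfo(shape, Format::S16));
        phase.allocator()->init(TensorInfo(shape, Format::U8));

        CLHOGGradient gradient;
        gradient.configure(&src, &magnitude, &phase, PhaseType::UNSIGNED, BorderMode::REPLICATE, 0);

        src.allocator()->allocate();
        magnitude.allocator()->allocate();
        phase.allocator()->allocate();

        // ... fill src ..., then compute per-pixel gradient magnitude and orientation
        gradient.run();
    }
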
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
new file mode 100644
index 0000000000..b8f2224ac8
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute;
+
+CLHOGMultiDetection::CLHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+ /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+ *
+ * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
+ *    Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+ * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and the block size do not change.
+ *    Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+ *
+ * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+ *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+ */
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialise gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+ // Configure CLTensor for the HOG space and orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialise orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure CLTensor for the normalized HOG space and block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ detection_window_strides->map(CLScheduler::get().queue(), true);
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ detection_window_strides->unmap(CLScheduler::get().queue());
+
+ // Configure non maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void CLHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ // Map detection windows array before computing non maxima suppression
+ _detection_windows->map(CLScheduler::get().queue(), true);
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ _detection_windows->unmap(CLScheduler::get().queue());
+ }
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
new file mode 100644
index 0000000000..2db277fa4d
--- /dev/null
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <cmath>
+#include <utility>
+
+using namespace arm_compute;
+
+CLHarrisCorners::CLHarrisCorners()
+ : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+ _corners(nullptr)
+{
+}
+
+void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
+ float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+
+ _corners = corners;
+
+ const TensorShape shape = input->info()->tensor_shape();
+ const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32;
+ TensorInfo tensor_info(shape, 1, dt);
+ _gx.allocator()->init(tensor_info);
+ _gy.allocator()->init(tensor_info);
+
+ TensorInfo info_f32(shape, 1, DataType::F32);
+ _score.allocator()->init(info_f32);
+ _nonmax.allocator()->init(info_f32);
+
+ _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ /* Set/init Sobel kernel according to gradient_size */
+ switch(gradient_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Gradient size not implemented");
+ }
+
+ // Configure border filling before harris score
+ _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
+ _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
+
+ // Normalization factor
+ const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
+ const float pow4_normalization_factor = pow(norm_factor, 4);
+
+ // Set/init Harris Score kernel according to block_size
+ _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+
+ // Allocate intermediate buffers
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _score.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void CLHarrisCorners::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+
+ // Init to 0 number of corner candidates
+ _num_corner_candidates = 0;
+
+ // Run Sobel kernel
+ _sobel->run();
+
+ // Fill border before harris score kernel
+ CLScheduler::get().enqueue(_border_gx, false);
+ CLScheduler::get().enqueue(_border_gy, false);
+
+ // Run harris score kernel
+ CLScheduler::get().enqueue(_harris_score, false);
+
+ // Run non-maxima suppression
+ CLScheduler::get().enqueue(_non_max_suppr);
+
+ // Run corner candidate kernel
+ _nonmax.map(true);
+ Scheduler::get().schedule(&_candidates, Window::DimY);
+ _nonmax.unmap();
+
+ _corners->map(CLScheduler::get().queue(), true);
+ _sort_euclidean.run(_sort_euclidean.window());
+ _corners->unmap(CLScheduler::get().queue());
+}
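
A hedged usage sketch for the corner detector above; the threshold, minimum distance, sensitivity and array capacity are illustrative values, and CLKeyPointArray is assumed to be the CLArray<KeyPoint> alias from CLArray.h:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"

    using namespace arm_compute;

    void harris_corners_example()
    {
        CLTensor src;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        // Maximum number of keypoints the output array can hold (illustrative)
        CLKeyPointArray corners(1000);

        CLHarrisCorners harris;
        // gradient_size = 3 (Sobel 3x3), block_size = 3; remaining values are illustrative
        harris.configure(&src, 1e-5f, 5.0f, 0.04f, 3, 3, &corners, BorderMode::UNDEFINED, 0);

        src.allocator()->allocate();

        // ... fill src ...; after run(), corners.num_values() reports how many corners were kept
        harris.run();
    }
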
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
new file mode 100644
index 0000000000..eb543387f5
--- /dev/null
+++ b/src/runtime/CL/functions/CLHistogram.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHistogram.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHistogram::CLHistogram()
+ : _kernel(), _kernel_border()
+{
+}
+
+void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ _kernel.configure(input, output);
+ _kernel_border.configure(input, output);
+}
+
+void CLHistogram::run()
+{
+ CLScheduler::get().enqueue(_kernel, false);
+ CLScheduler::get().enqueue(_kernel_border);
+}
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
new file mode 100644
index 0000000000..2d54be32fa
--- /dev/null
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
+
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLIntegralImage::CLIntegralImage()
+ : _integral_hor(), _integral_vert()
+{
+}
+
+void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
+{
+ _integral_hor.configure(input, output);
+ _integral_vert.configure(output);
+}
+
+void CLIntegralImage::run()
+{
+ CLScheduler::get().enqueue(_integral_hor, false);
+ CLScheduler::get().enqueue(_integral_vert);
+}
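
The integral image is computed with a horizontal pass into the output followed by an in-place vertical pass. A short sketch, assuming the output format is U32 as required by the underlying kernels:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"

    using namespace arm_compute;

    void integral_image_example()
    {
        const TensorShape shape(640U, 480U);

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(shape, Format::U8));
        dst.allocator()->init(TensorInfo(shape, Format::U32));

        CLIntegralImage integral;
        integral.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src ..., then compute the summed-area table
        integral.run();
    }
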
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
new file mode 100644
index 0000000000..d7ce20653d
--- /dev/null
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+
+using namespace arm_compute;
+
+CLLaplacianPyramid::CLLaplacianPyramid()
+ : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+{
+}
+
+void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ _num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the gaussian pyramid and the convolved pyramid
+ PyramidInfo pyramid_info;
+ pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
+
+ _gauss_pyr.init(pyramid_info);
+ _conv_pyr.init(pyramid_info);
+
+ // Create Gaussian Pyramid function
+ _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+
+ _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+ _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+ }
+
+ _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+ _gauss_pyr.allocate();
+ _conv_pyr.allocate();
+}
+
+void CLLaplacianPyramid::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
+
+ _gaussian_pyr_function.run(); // compute gaussian pyramid
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].run(); // convolve gaussian pyramid
+ }
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _subf[i].run(); // compute laplacian image
+ }
+
+ _depth_function.run();
+}
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
new file mode 100644
index 0000000000..1dfab740d7
--- /dev/null
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLLaplacianReconstruct::CLLaplacianReconstruct()
+ : _tmp_pyr(), _addf(), _scalef(), _depthf()
+{
+}
+
+void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
+ PyramidInfo pyramid_info;
+ pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
+ _tmp_pyr.init(pyramid_info);
+
+ // Allocate add and scale functions. Level 0 does not need to be scaled.
+ _addf = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+
+ const size_t last_level = num_levels - 1;
+
+ _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
+
+ // Scale levels n-1 to 1, and add levels n-2 to 0
+ for(size_t l = 0; l < last_level; ++l)
+ {
+ _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+ _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
+ }
+
+ // Convert level 0 from S16 to U8
+ _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+ _tmp_pyr.allocate();
+}
+
+void CLLaplacianReconstruct::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+
+ const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
+
+ _addf[last_level].run();
+
+ // Run l = [last_level - 1, 0]
+ for(size_t l = last_level; l-- > 0;)
+ {
+ _scalef[l].run();
+ _addf[l].run();
+ }
+
+ _depthf.run();
+}
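
CLLaplacianPyramid and CLLaplacianReconstruct above are designed to be used as a pair: the decomposition produces an S16 pyramid of difference images plus a low-resolution S16 residual, and the reconstruction rebuilds the original-resolution U8 image from them. A sketch of the round trip, assuming an illustrative 3-level 640x480 setup (so the last pyramid level, and hence the residual, is 160x120):

    #include "arm_compute/core/PyramidInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLPyramid.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
    #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"

    using namespace arm_compute;

    void laplacian_round_trip_example()
    {
        CLTensor src, residual, restored;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
        // Lowest-resolution output of the decomposition: same size as the last pyramid level
        residual.allocator()->init(TensorInfo(TensorShape(160U, 120U), Format::S16));
        restored.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        // The Laplacian levels hold signed differences, hence the S16 pyramid format
        PyramidInfo pyramid_info(3, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::S16);
        CLPyramid   laplace;
        laplace.init(pyramid_info);

        CLLaplacianPyramid     decompose;
        CLLaplacianReconstruct reconstruct;
        decompose.configure(&src, &laplace, &residual, BorderMode::REPLICATE, 0);
        reconstruct.configure(&laplace, &residual, &restored, BorderMode::REPLICATE, 0);

        src.allocator()->allocate();
        residual.allocator()->allocate();
        restored.allocator()->allocate();
        laplace.allocate();

        // ... fill src ..., then decompose and rebuild
        decompose.run();
        reconstruct.run();
    }
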
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
new file mode 100644
index 0000000000..263fb51987
--- /dev/null
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
+
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
+
+void CLLocallyConnectedLayer::run()
+{
+ // Run weights reshaping (runs only once after each call to configure())
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ }
+
+ // Run input reshaping
+ CLScheduler::get().enqueue(_input_im2col_kernel);
+
+ // Runs vector matrix multiply on reshaped matrices
+ CLScheduler::get().enqueue(_mm_kernel);
+
+ // Reshape output matrix
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
new file mode 100644
index 0000000000..51088cb71f
--- /dev/null
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ k->configure(input1, input2, output, nullptr, mag_type);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
new file mode 100644
index 0000000000..56ba146790
--- /dev/null
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLMeanStdDev::CLMeanStdDev()
+ : _mean_stddev_kernel(),
+ _global_sum(),
+ _global_sum_squared()
+{
+}
+
+void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+{
+ _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+
+ if(stddev != nullptr)
+ {
+ _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+ }
+
+ _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+}
+
+void CLMeanStdDev::run()
+{
+ CLScheduler::get().enqueue(_mean_stddev_kernel);
+}
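
A small sketch of the mean/standard-deviation function above; the results land in the host-side floats passed at configure time once run() completes, and the stddev pointer may be left as nullptr when only the mean is needed:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"

    using namespace arm_compute;

    void mean_stddev_example()
    {
        CLTensor src;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        float mean   = 0.0f;
        float stddev = 0.0f;

        CLMeanStdDev mean_stddev;
        mean_stddev.configure(&src, &mean, &stddev);
        src.allocator()->allocate();

        // ... fill src ..., then compute both statistics
        mean_stddev.run();
    }
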
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
new file mode 100644
index 0000000000..0c10f9aa08
--- /dev/null
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
new file mode 100644
index 0000000000..ad783d8a53
--- /dev/null
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+using namespace arm_compute;
+
+CLMinMaxLocation::CLMinMaxLocation()
+ : _min_max_kernel(),
+ _min_max_loc_kernel(),
+ _min_max_vals(),
+ _min_max_count_vals(),
+ _min(nullptr),
+ _max(nullptr),
+ _min_count(nullptr),
+ _max_count(nullptr),
+ _min_loc(nullptr),
+ _max_loc(nullptr)
+{
+}
+
+void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t));
+ _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t));
+ _min = min;
+ _max = max;
+ _min_count = min_count;
+ _max_count = max_count;
+ _min_loc = min_loc;
+ _max_loc = max_loc;
+
+ _min_max_kernel.configure(input, &_min_max_vals);
+ _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
+}
+
+void CLMinMaxLocation::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ CLScheduler::get().enqueue(_min_max_kernel, false);
+ CLScheduler::get().enqueue(_min_max_loc_kernel, false);
+
+ // Update min and max
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+
+ // Update min and max count
+ if(_min_count != nullptr)
+ {
+ q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count);
+ }
+ if(_max_count != nullptr)
+ {
+ q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count);
+ }
+
+ // Update min/max location arrays (the blocking reads make this call synchronous)
+ if(_min_loc != nullptr)
+ {
+ unsigned int min_count = 0;
+ q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count);
+ size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values());
+ _min_loc->resize(min_corner_size);
+ }
+ if(_max_loc != nullptr)
+ {
+ unsigned int max_count = 0;
+ q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count);
+ size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values());
+ _max_loc->resize(max_corner_size);
+ }
+}
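
A short usage sketch for CLMinMaxLocation (illustration only, not part of the patch); the image, counts and the array capacity are assumptions picked for the example. Because run() performs blocking reads for the location arrays, the outputs are valid once it returns:

    CLImage src; // U8 image, assumed already allocated and filled
    int32_t  min_val = 0, max_val = 0;
    uint32_t min_count = 0, max_count = 0;
    CLCoordinates2DArray min_loc(100); // capacity chosen arbitrarily for this sketch
    CLCoordinates2DArray max_loc(100);

    CLMinMaxLocation min_max_loc;
    min_max_loc.configure(&src, &min_val, &max_val, &min_loc, &max_loc, &min_count, &max_count);
    min_max_loc.run();
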
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
new file mode 100644
index 0000000000..b593a6cced
--- /dev/null
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
+
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+ k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
new file mode 100644
index 0000000000..ca7d5aede7
--- /dev/null
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
+{
+ auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+
+ if(border_mode != BorderMode::UNDEFINED)
+ {
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT);
+ }
+ else
+ {
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED);
+ }
+}
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
new file mode 100644
index 0000000000..2d89ebd676
--- /dev/null
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayer::CLNormalizationLayer()
+ : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+{
+}
+
+void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+
+ _norm_kernel.configure(input, &_squared_input, output, norm_info);
+ _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
+ _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Allocate intermediate buffers
+ _squared_input.allocator()->allocate();
+}
+
+void CLNormalizationLayer::run()
+{
+ CLScheduler::get().enqueue(_multiply_kernel, false);
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_norm_kernel, false);
+}
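
As a reading aid (illustration only, not part of the patch), a scalar reference of what the square-then-normalise decomposition above computes for CROSS_MAP normalisation, assuming the usual AlexNet-style LRN definition with kappa, alpha, beta and norm_size taken from NormalizationLayerInfo; the OpenCL kernels vectorise this and also handle the IN_MAP variant:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Hypothetical scalar reference (assumption: AlexNet-style LRN).
    float lrn_cross_map(const std::vector<float> &in, std::size_t channel,
                        float kappa, float alpha, float beta, std::size_t norm_size)
    {
        const std::size_t half = norm_size / 2;
        const std::size_t lo   = (channel < half) ? 0 : channel - half;
        const std::size_t hi   = std::min(in.size() - 1, channel + half);

        float sum_sq = 0.f; // sum of squared inputs, i.e. what _multiply_kernel feeds to _norm_kernel
        for(std::size_t c = lo; c <= hi; ++c)
        {
            sum_sq += in[c] * in[c];
        }
        // _norm_kernel then applies the normalisation to the original input
        return in[channel] * std::pow(kappa + (alpha / norm_size) * sum_sq, -beta);
    }
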
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
new file mode 100644
index 0000000000..a6b0eb3bec
--- /dev/null
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+
+using namespace arm_compute;
+
+CLOpticalFlow::CLOpticalFlow()
+ : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
+ _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+{
+}
+
+void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
+ const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == old_points);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points);
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
+
+ // Set member variables
+ _old_points = old_points;
+ _new_points_estimates = new_points_estimates;
+ _new_points = new_points;
+ _num_levels = old_pyramid->info()->num_levels();
+
+ const float pyr_scale = old_pyramid->info()->scale();
+ const int list_length = old_points->num_values();
+ const int old_values_list_length = list_length * window_dimension * window_dimension;
+
+ // Create kernels and tensors
+ _tracker_init_kernel = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+ _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+ _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+ _func_scharr = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+ _scharr_gx = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _scharr_gy = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+
+ // Create internal keypoint arrays
+ _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _old_points_internal->resize(list_length);
+ _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _new_points_internal->resize(list_length);
+ _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+ _coefficient_table->resize(list_length);
+ _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+ _old_values->resize(old_values_list_length);
+ _new_points->resize(list_length);
+
+ for(size_t i = 0; i < _num_levels; ++i)
+ {
+ // Get images from the ith level of the old and new pyramids
+ ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i);
+ ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i);
+
+ // Get width and height of images
+ const unsigned int width_ith = old_ith_input->info()->dimension(0);
+ const unsigned int height_ith = new_ith_input->info()->dimension(1);
+
+ // Initialize Scharr tensors
+ TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16);
+ _scharr_gx[i].allocator()->init(tensor_info);
+ _scharr_gy[i].allocator()->init(tensor_info);
+
+ // Init Scharr kernel
+ _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
+
+ // Init Lucas-Kanade init kernel
+ _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
+
+ // Init Lucas-Kanade stage0 kernel
+ _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+ _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+ window_dimension, i);
+
+ // Init Lucas-Kanade stage1 kernel
+ _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+ termination, epsilon, num_iterations, window_dimension, i);
+
+ // Allocate intermediate buffers
+ _scharr_gx[i].allocator()->allocate();
+ _scharr_gy[i].allocator()->allocate();
+ }
+
+ // Finalize Lucas-Kanade
+ _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points);
+}
+
+void CLOpticalFlow::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+
+ for(unsigned int level = _num_levels; level > 0; --level)
+ {
+ // Run Scharr kernel
+ _func_scharr[level - 1].run();
+
+ // Run Lucas-Kanade init kernel
+ CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]);
+
+ // Run Lucas-Kanade stage0 kernel
+ CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]);
+
+ // Run Lucas-Kanade stage1 kernel
+ CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]);
+ }
+
+ CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+}
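
For background (not part of the patch): at each pyramid level the tracker effectively solves, per keypoint, the classic Lucas-Kanade normal equations over the tracking window; stage0 accumulates the spatial-gradient matrix from the Scharr outputs and stage1 iterates on the image difference. In the usual notation,

    G = \sum_{x \in W} \begin{bmatrix} I_x^2 & I_x I_y \\ I_x I_y & I_y^2 \end{bmatrix}, \qquad
    b = \sum_{x \in W} \begin{bmatrix} I_x \, \delta I \\ I_y \, \delta I \end{bmatrix}, \qquad
    d = G^{-1} b,

where \delta I is the difference between the old and new images around the current estimate. The update d is iterated until \|d\| < epsilon or num_iterations is reached, and in the standard pyramidal scheme the estimate is propagated to the next finer level scaled by 1/pyr_scale (the loop above runs from the coarsest level down to level 0).
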
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
new file mode 100644
index 0000000000..a8cb22b06e
--- /dev/null
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPhase.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..8a86c2e203
--- /dev/null
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
new file mode 100644
index 0000000000..1ef70f4a2b
--- /dev/null
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+ // Configure pooling kernel
+ auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+ k->configure(input, output, pool_info);
+ _kernel = std::move(k);
+
+ // Configure border depending on operation required
+ BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+}
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
new file mode 100644
index 0000000000..f6b1713c58
--- /dev/null
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRemap.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+ auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+ k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
new file mode 100644
index 0000000000..043f873028
--- /dev/null
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLScale.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(output == input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+ k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
new file mode 100644
index 0000000000..c8bc465be6
--- /dev/null
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
new file mode 100644
index 0000000000..6b74ebaedb
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
new file mode 100644
index 0000000000..098b546c1a
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLSobel5x5::CLSobel5x5()
+ : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+{
+}
+
+void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void CLSobel5x5::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_sobel_hor, false);
+ CLScheduler::get().enqueue(_sobel_vert);
+}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
new file mode 100644
index 0000000000..db84fa99ae
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLSobel7x7::CLSobel7x7()
+ : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+{
+}
+
+void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void CLSobel7x7::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_sobel_hor, false);
+ CLScheduler::get().enqueue(_sobel_vert);
+}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
new file mode 100644
index 0000000000..2a78c58053
--- /dev/null
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSoftmaxLayer::CLSoftmaxLayer()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+{
+}
+
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+
+ // Create intermediate tensors shapes
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+
+ TensorShape shape = input->info()->tensor_shape();
+ shape.set(0, 1);
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ _max.allocator()->init(tensor_info_max_sum);
+ _sum.allocator()->init(tensor_info_max_sum);
+
+ // Configure Kernels
+ _max_kernel.configure(input, &_max);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _norm_kernel.configure(&_tmp, &_sum, output);
+
+ // Allocate intermediate buffers
+ _tmp.allocator()->allocate();
+ _max.allocator()->allocate();
+ _sum.allocator()->allocate();
+}
+
+void CLSoftmaxLayer::run()
+{
+ CLScheduler::get().enqueue(_max_kernel, false);
+ CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+ CLScheduler::get().enqueue(_norm_kernel);
+}
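
A scalar reference of the three stages chained above (illustration only, not part of the patch): find the maximum, subtract it and exponentiate while accumulating the sum, then normalise. The CL kernels apply this per row of the input; the sketch assumes a non-empty vector:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> softmax_reference(const std::vector<float> &x)
    {
        const float max_val = *std::max_element(x.begin(), x.end()); // _max_kernel
        std::vector<float> tmp(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i) // _shift_exp_sum_kernel
        {
            tmp[i] = std::exp(x[i] - max_val);
            sum += tmp[i];
        }
        for(float &v : tmp) // _norm_kernel
        {
            v /= sum;
        }
        return tmp;
    }
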
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
new file mode 100644
index 0000000000..743ed5e73e
--- /dev/null
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+ k->configure(input, lut, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
new file mode 100644
index 0000000000..e70f932d66
--- /dev/null
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLThreshold.h"
+
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+ k->configure(input, output, threshold, false_value, true_value, type, upper);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
new file mode 100644
index 0000000000..d802b4fe77
--- /dev/null
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
new file mode 100644
index 0000000000..537e0d9397
--- /dev/null
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
+
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+ k->configure(input, output, matrix, policy);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
new file mode 100644
index 0000000000..a552ab480d
--- /dev/null
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
+
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+ k->configure(input, output, matrix, policy);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
new file mode 100644
index 0000000000..886933074d
--- /dev/null
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <iostream>
+#include <semaphore.h>
+#include <system_error>
+#include <thread>
+
+using namespace arm_compute;
+
+class arm_compute::Thread
+{
+public:
+ /** Start a new thread
+ */
+ Thread();
+ Thread(const Thread &) = delete;
+ Thread &operator=(const Thread &) = delete;
+ Thread(Thread &&) = delete;
+ Thread &operator=(Thread &&) = delete;
+ /** Make the thread join
+ */
+ ~Thread();
+    /** Request the worker thread to start executing the given kernel.
+     * This function returns as soon as the kernel has been sent to the worker thread;
+     * wait() must be called to ensure the execution is complete.
+ */
+ void start(ICPPKernel *kernel, const Window &window);
+ /** Wait for the current kernel execution to complete
+ */
+ void wait();
+    /** Function run by the worker thread
+ */
+ void worker_thread();
+
+private:
+ std::thread _thread;
+ ICPPKernel *_kernel{ nullptr };
+ Window _window;
+ sem_t _wait_for_work;
+ sem_t _job_complete;
+ std::exception_ptr _current_exception;
+};
+
+Thread::Thread()
+ : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+{
+ int ret = sem_init(&_wait_for_work, 0, 0);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ ret = sem_init(&_job_complete, 0, 0);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ _thread = std::thread(&Thread::worker_thread, this);
+}
+
+Thread::~Thread()
+{
+ ARM_COMPUTE_ERROR_ON(!_thread.joinable());
+
+ start(nullptr, Window());
+ _thread.join();
+
+ int ret = sem_destroy(&_wait_for_work);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ ret = sem_destroy(&_job_complete);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+}
+
+void Thread::start(ICPPKernel *kernel, const Window &window)
+{
+ _kernel = kernel;
+ _window = window;
+ int ret = sem_post(&_wait_for_work);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+}
+
+void Thread::wait()
+{
+ int ret = sem_wait(&_job_complete);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ if(_current_exception)
+ {
+ std::rethrow_exception(_current_exception);
+ }
+}
+
+void Thread::worker_thread()
+{
+ while(sem_wait(&_wait_for_work) >= 0)
+ {
+ _current_exception = nullptr;
+ // Time to exit
+ if(_kernel == nullptr)
+ {
+ return;
+ }
+
+ try
+ {
+ _window.validate();
+ _kernel->run(_window);
+ }
+ catch(...)
+ {
+ _current_exception = std::current_exception();
+ }
+ int ret = sem_post(&_job_complete);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ }
+
+ ARM_COMPUTE_ERROR("Wait failed");
+}
+
+namespace
+{
+void delete_threads(Thread *t)
+{
+ delete[] t;
+}
+} // namespace
+
+CPPScheduler &CPPScheduler::get()
+{
+ static CPPScheduler scheduler;
+ return scheduler;
+}
+
+unsigned int CPPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
+CPPScheduler::CPPScheduler()
+ : _num_threads(std::thread::hardware_concurrency()),
+ _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
+{
+}
+
+void CPPScheduler::set_num_threads(unsigned int num_threads)
+{
+ const unsigned int num_cores = std::thread::hardware_concurrency();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
+}
+
+void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+
+ /** [Scheduler example] */
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
+
+ if(!kernel->is_parallelisable() || 1 == num_threads)
+ {
+ kernel->run(max_window);
+ }
+ else
+ {
+ for(unsigned int t = 0; t < num_threads; ++t)
+ {
+ Window win = max_window.split_window(split_dimension, t, num_threads);
+ win.set_thread_id(t);
+ win.set_num_threads(num_threads);
+
+ if(t != num_threads - 1)
+ {
+ _threads[t].start(kernel, win);
+ }
+ else
+ {
+ kernel->run(win);
+ }
+ }
+
+ try
+ {
+ for(unsigned int t = 1; t < num_threads; ++t)
+ {
+ _threads[t - 1].wait();
+ }
+ }
+ catch(const std::system_error &e)
+ {
+ std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+ }
+ }
+ /** [Scheduler example] */
+}
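The schedule() implementation above splits the kernel's maximum window along the requested dimension, dispatches the first num_threads - 1 sub-windows to the worker threads and runs the last one on the calling thread before waiting for the workers. For illustration, a minimal, hypothetical sketch of driving the scheduler directly (the kernel is assumed to be already configured; this snippet is not part of the patch):

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"

// Hypothetical sketch: run an already-configured CPP kernel on the thread pool above.
void run_with_pool(arm_compute::ICPPKernel &kernel)
{
    arm_compute::CPPScheduler::get().set_num_threads(0);                           // 0 -> one thread per core
    arm_compute::CPPScheduler::get().schedule(&kernel, arm_compute::Window::DimY); // split the window along Y
}

Each worker blocks on sem_wait() until start() posts a job, and wait() rethrows any exception captured in worker_thread().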
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
new file mode 100644
index 0000000000..f086813e91
--- /dev/null
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+SingleThreadScheduler &SingleThreadScheduler::get()
+{
+ static SingleThreadScheduler scheduler;
+ return scheduler;
+}
+
+void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
+{
+ ARM_COMPUTE_UNUSED(num_threads);
+}
+
+void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_UNUSED(split_dimension);
+ kernel->run(kernel->window());
+}
+
+unsigned int SingleThreadScheduler::num_threads() const
+{
+ return 1;
+}
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
new file mode 100644
index 0000000000..b06767499b
--- /dev/null
+++ b/src/runtime/Distribution1D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Distribution1D.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins))
+{
+}
+
+uint32_t *Distribution1D::buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == _data);
+ return _data.get();
+}
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
new file mode 100644
index 0000000000..5d533dded4
--- /dev/null
+++ b/src/runtime/HOG.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/HOG.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+HOG::HOG()
+ : IHOG(), _info(), _descriptor(nullptr)
+{
+}
+
+void HOG::init(const HOGInfo &input)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
+ _info = input;
+ _descriptor = arm_compute::cpp14::make_unique<float[]>(_info.descriptor_size());
+}
+
+float *HOG::descriptor() const
+{
+ return _descriptor.get();
+}
+
+const HOGInfo *HOG::info() const
+{
+ return &_info;
+}
diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp
new file mode 100644
index 0000000000..fb961638f1
--- /dev/null
+++ b/src/runtime/ILutAllocator.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/ILutAllocator.h"
+
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+ILutAllocator::ILutAllocator()
+ : _num_elements(0), _data_type(DataType::U8)
+{
+}
+
+void ILutAllocator::init(size_t num_elements, DataType data_type)
+{
+ // Init internal metadata
+ _num_elements = num_elements;
+ _data_type = data_type;
+
+    // Allocate the LUT's backing memory
+ allocate();
+}
+
+size_t ILutAllocator::num_elements() const
+{
+ return _num_elements;
+}
+
+DataType ILutAllocator::type() const
+{
+ return _data_type;
+}
+
+size_t ILutAllocator::size() const
+{
+ return data_size_from_type(_data_type) * num_elements();
+}
diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp
new file mode 100644
index 0000000000..8294201384
--- /dev/null
+++ b/src/runtime/ITensorAllocator.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+ITensorAllocator::ITensorAllocator()
+ : _info()
+{
+}
+
+void ITensorAllocator::init(const TensorInfo &input)
+{
+ _info = input;
+}
+
+TensorInfo &ITensorAllocator::info()
+{
+ return _info;
+}
+
+const TensorInfo &ITensorAllocator::info() const
+{
+ return _info;
+}
diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp
new file mode 100644
index 0000000000..1b3daf1f60
--- /dev/null
+++ b/src/runtime/Lut.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Lut.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+Lut::Lut()
+ : _allocator()
+{
+}
+
+Lut::Lut(size_t num_elements, DataType data_type)
+ : _allocator()
+{
+ _allocator.init(num_elements, data_type);
+}
+
+size_t Lut::num_elements() const
+{
+ return _allocator.num_elements();
+}
+
+uint32_t Lut::index_offset() const
+{
+ return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
+}
+
+size_t Lut::size_in_bytes() const
+{
+ return _allocator.size();
+}
+
+DataType Lut::type() const
+{
+ return _allocator.type();
+}
+
+uint8_t *Lut::buffer() const
+{
+ return _allocator.data();
+}
+
+void Lut::clear()
+{
+ ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
+ std::memset(this->buffer(), 0, this->size_in_bytes());
+}
+
+ILutAllocator *Lut::allocator()
+{
+ return &_allocator;
+}
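Because index_offset() above returns num_elements() / 2 for signed 16-bit tables, callers can recentre negative inputs into the buffer. A hypothetical lookup helper (not part of this patch) illustrating that addressing:

#include "arm_compute/runtime/Lut.h"
#include <cstdint>

// Hypothetical helper: reads a signed 16-bit LUT, using index_offset()
// to map negative input values into the table.
int16_t lookup_s16(const arm_compute::Lut &lut, int16_t value)
{
    auto *table = reinterpret_cast<const int16_t *>(lut.buffer());
    return table[static_cast<int32_t>(value) + static_cast<int32_t>(lut.index_offset())];
}

For a 65536-entry S16 table the offset is 32768, so an input of -32768 maps to index 0.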
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
new file mode 100644
index 0000000000..17baf21f45
--- /dev/null
+++ b/src/runtime/LutAllocator.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/LutAllocator.h"
+
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+LutAllocator::LutAllocator()
+ : _buffer(nullptr)
+{
+}
+
+uint8_t *LutAllocator::data() const
+{
+ return _buffer.get();
+}
+
+void LutAllocator::allocate()
+{
+ _buffer = arm_compute::cpp14::make_unique<uint8_t[]>(size());
+}
+
+uint8_t *LutAllocator::lock()
+{
+ return _buffer.get();
+}
+
+void LutAllocator::unlock()
+{
+}
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
new file mode 100644
index 0000000000..003dc93895
--- /dev/null
+++ b/src/runtime/MultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MultiHOG.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiHOG.h"
+
+using namespace arm_compute;
+
+MultiHOG::MultiHOG(size_t num_models)
+ : _num_models(num_models), _model(arm_compute::cpp14::make_unique<HOG[]>(_num_models))
+{
+}
+
+size_t MultiHOG::num_models() const
+{
+ return _num_models;
+}
+
+IHOG *MultiHOG::model(size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
+
+const IHOG *MultiHOG::model(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp
new file mode 100644
index 0000000000..def1487c5e
--- /dev/null
+++ b/src/runtime/MultiImage.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MultiImage.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+MultiImage::MultiImage()
+ : _info(), _plane()
+{
+}
+
+const MultiImageInfo *MultiImage::info() const
+{
+ return &_info;
+}
+
+void MultiImage::init(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, false);
+}
+
+void MultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, true);
+}
+
+void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
+{
+ TensorInfo info(width, height, Format::U8);
+
+ if(auto_padding)
+ {
+ info.auto_padding();
+ }
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ TensorInfo info_full(width, height, format);
+
+ if(auto_padding)
+ {
+ info_full.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+
+ if(auto_padding)
+ {
+ info_uv88.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+
+ if(auto_padding)
+ {
+ info_sub2.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_sub2);
+ std::get<2>(_plane).allocator()->init(info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info);
+ std::get<2>(_plane).allocator()->init(info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
+
+void MultiImage::allocate()
+{
+ switch(_info.format())
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ std::get<0>(_plane).allocator()->allocate();
+ break;
+ case Format::NV12:
+ case Format::NV21:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ std::get<2>(_plane).allocator()->allocate();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+void MultiImage::create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height)
+{
+ arm_compute::Format format = image->info()->format();
+ const TensorInfo info(width, height, Format::U8);
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F32:
+ case Format::F16:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ const TensorInfo info_full(width, height, format);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ const TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+ std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ const TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+ std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_sub2);
+ std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+            std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info);
+            std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
+
+Image *MultiImage::plane(unsigned int index)
+{
+ return &_plane[index];
+}
+
+const Image *MultiImage::plane(unsigned int index) const
+{
+ return &_plane[index];
+}
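internal_init() above derives the per-plane layout from the overall format: packed formats get a single full-size plane, NV12/NV21 get a full-resolution Y plane plus a half-resolution UV88 plane, IYUV gets three planes with U and V at half resolution, and YUV444 gets three full-size planes. A small, hypothetical usage sketch (dimensions are arbitrary; not part of this patch):

#include "arm_compute/runtime/MultiImage.h"

// Hypothetical sketch: NV12 frame with a full-resolution Y plane and a
// half-resolution interleaved UV plane.
void make_nv12_frame()
{
    arm_compute::MultiImage frame;
    frame.init(640, 480, arm_compute::Format::NV12); // plane 0: 640x480 U8, plane 1: 320x240 UV88
    frame.allocate();                                // allocates every plane of the chosen format

    arm_compute::Image *luma   = frame.plane(0);
    arm_compute::Image *chroma = frame.plane(1);
    (void)luma;
    (void)chroma;
}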
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
new file mode 100644
index 0000000000..6f0da85fc8
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+INESimpleFunction::INESimpleFunction()
+ : _kernel(), _border_handler()
+{
+}
+
+void INESimpleFunction::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
new file mode 100644
index 0000000000..b39feb3a2b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
new file mode 100644
index 0000000000..c39abfc540
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEAccumulate::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16)
+{
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+ }
+}
+
+void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateSquaredKernel>();
+ k->configure(input, shift, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
new file mode 100644
index 0000000000..f5d81d7cd8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+
+using namespace arm_compute;
+
+void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+{
+ auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
+ k->configure(input, output, activation_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
new file mode 100644
index 0000000000..50cc38b489
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEArithmeticAdditionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
new file mode 100644
index 0000000000..a3d27c0ed6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEArithmeticSubtractionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
new file mode 100644
index 0000000000..a24429c6de
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayer::NEBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ // Configure kernel
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void NEBatchNormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
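configure() above forwards the per-channel mean, variance, beta and gamma tensors plus an epsilon term to the kernel, and run() schedules it along the Y dimension. A hypothetical call sequence, assuming F32 data and arbitrary shapes (not part of this patch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

// Hypothetical sketch: batch-normalize an F32 activation tensor of shape [W, H, C, N].
void batch_normalize()
{
    using namespace arm_compute;

    Tensor src, dst, mean, var, beta, gamma;
    const TensorInfo act_info(TensorShape(16U, 16U, 8U, 1U), 1, DataType::F32); // W=16, H=16, C=8, N=1
    const TensorInfo chan_info(TensorShape(8U), 1, DataType::F32);              // one value per channel

    src.allocator()->init(act_info);
    dst.allocator()->init(act_info);
    mean.allocator()->init(chan_info);
    var.allocator()->init(chan_info);
    beta.allocator()->init(chan_info);
    gamma.allocator()->init(chan_info);

    NEBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f);

    // Allocate backing memory and fill the tensors before calling run().
    src.allocator()->allocate();
    dst.allocator()->allocate();
    mean.allocator()->allocate();
    var.allocator()->allocate();
    beta.allocator()->allocate();
    gamma.allocator()->allocate();

    bn.run();
}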
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
new file mode 100644
index 0000000000..5aafc51dc0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseAndKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
new file mode 100644
index 0000000000..af3df6e46a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseNotKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
new file mode 100644
index 0000000000..d12c5e5f6f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseOrKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
new file mode 100644
index 0000000000..65c943e64c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseXorKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
new file mode 100644
index 0000000000..7f0b45d34c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEBox3x3FP16Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEBox3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ }
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
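configure() above selects the FP16 or the plain NEBox3x3Kernel and sets up border handling; the inherited INESimpleFunction::run() then fills the border and schedules the kernel. A hypothetical end-to-end sketch on a U8 image (shape and border mode are arbitrary; not part of this patch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
#include "arm_compute/runtime/Tensor.h"

// Hypothetical sketch: 3x3 box filter on a 640x480 U8 image.
void blur_u8()
{
    arm_compute::Tensor src, dst;
    src.allocator()->init(arm_compute::TensorInfo(640, 480, arm_compute::Format::U8));
    dst.allocator()->init(arm_compute::TensorInfo(640, 480, arm_compute::Format::U8));

    arm_compute::NEBox3x3 box;
    box.configure(&src, &dst, arm_compute::BorderMode::REPLICATE, 0, false); // plain (non-FP16) kernel

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    box.run();
}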
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
new file mode 100644
index 0000000000..26f31f557b
--- /dev/null
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstring>
+#include <utility>
+
+using namespace arm_compute;
+
+NECannyEdge::NECannyEdge()
+ : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr)
+{
+}
+
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
+ bool use_fp16)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(gradient_size < 3);
+ ARM_COMPUTE_ERROR_ON(gradient_size > 7);
+ ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
+ ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+
+ _output = output;
+
+ const TensorShape &shape = input->info()->tensor_shape();
+ TensorInfo gradient_info;
+ TensorInfo magnitude_info;
+
+ // Initialize images
+ if(gradient_size < 7)
+ {
+ gradient_info.init(shape, Format::S16);
+ magnitude_info.init(shape, Format::U16);
+ }
+ else
+ {
+ gradient_info.init(shape, Format::S32);
+ magnitude_info.init(shape, Format::U32);
+ }
+
+ _gx.allocator()->init(gradient_info);
+ _gy.allocator()->init(gradient_info);
+ _magnitude.allocator()->init(magnitude_info);
+
+ TensorInfo info(shape, Format::U8);
+ _phase.allocator()->init(info);
+ _nonmax.allocator()->init(info);
+
+ // Configure/Init sobelNxN
+ if(gradient_size == 3)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 5)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 7)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Gradient size not supported\n");
+ }
+
+ // Configure gradient
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEGradientKernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
+ }
+
+ // Configure non-maxima suppression
+ _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
+
+ // Fill border around magnitude image as non-maxima suppression will access
+ // it. If border mode is undefined filling the border is a nop.
+ _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
+
+ // Configure edge tracing
+ _edge_trace.configure(&_nonmax, output);
+
+ // Fill border with "No edge" to stop recursion in edge trace
+ _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _magnitude.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void NECannyEdge::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON(_output == nullptr);
+
+ // Run sobelNxN
+ _sobel->run();
+
+ // Fill border before non-maxima suppression. Nop for border mode undefined.
+ _border_mag_gradient.run(_border_mag_gradient.window());
+
+ // Run gradient
+ NEScheduler::get().schedule(_gradient.get(), Window::DimY);
+
+ // Run non-maxima suppression
+ NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
+
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ memset(_output->buffer(), 0, _output->info()->total_size());
+
+ // Fill border before edge trace
+ _border_edge_trace.run(_border_edge_trace.window());
+
+ // Run edge tracing
+ _edge_trace.run(_edge_trace.window());
+}
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
new file mode 100644
index 0000000000..84d4fff4ff
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, plane3, output);
+ _kernel = std::move(k);
+}
+
+void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
new file mode 100644
index 0000000000..634e918eac
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
+
+void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
new file mode 100644
index 0000000000..bbaa832284
--- /dev/null
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEColorConvert::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IMultiImage *input, IImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IImage *input, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
new file mode 100644
index 0000000000..3f39ae2cbd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <array>
+#include <utility>
+
+using namespace arm_compute;
+
+void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEConvolution3x3Kernel>();
+ k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+template <unsigned int matrix_size>
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
+ : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+{
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ std::array<int16_t, matrix_size> conv_col{ { 0 } };
+ std::array<int16_t, matrix_size> conv_row{ { 0 } };
+
+ _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
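+ // A rank-1 convolution matrix can be expressed as the outer product of a column and a row
+ // vector; in that case two 1D passes (horizontal then vertical) replace the full 2D convolution.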
+
+ if(_is_separable)
+ {
+ DataType intermediate_type = DataType::UNKNOWN;
+ std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
+
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
+
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
+
+ _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
+
+ _tmp.allocator()->allocate();
+
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+ else
+ {
+ _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionSquare<matrix_size>::run()
+{
+ _border_handler.run(_border_handler.window());
+
+ if(_is_separable)
+ {
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+ }
+ else
+ {
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
+ }
+}
+
+template class arm_compute::NEConvolutionSquare<5>;
+template class arm_compute::NEConvolutionSquare<7>;
+template class arm_compute::NEConvolutionSquare<9>;
+
+void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEConvolutionRectangleKernel>();
+ k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
new file mode 100644
index 0000000000..bd688cffb6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
+
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ // Check if biases are present; if so, they will be embedded into the weights matrix
+ const bool _has_bias = (biases != nullptr);
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
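+ // Each output feature map contributes one column of the reshaped matrix; each column holds the
+ // flattened filter (width * height * input channels) plus one extra row for the bias, if present.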
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ if(_transpose1xW)
+ {
+ NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+ }
+}
+
+NEConvolutionLayer::NEConvolutionLayer()
+ : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Check if it is a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
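+ // The 1xW transpose packs blocks of 16 bytes (16 / element_size values) of each row
+ // contiguously, matching the access pattern of the NEON GEMM kernel.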
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ }
+ weights = &_weights_reshaped;
+ }
+
+ // Create tensor to store im2col reshaped inputs
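+ // im2col lays out each convolution window as one row: conv_w * conv_h rows (one per output
+ // position) by kernel_w * kernel_h * channels columns (plus one column for the bias, if present).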
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Create tensor (interleave) to prepare input tensor for GEMM
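+ // Interleaving groups values from 4 consecutive rows into 4x4 blocks, making the tensor
+ // 4 times wider and a quarter as tall, to match the blocking used by the GEMM kernel.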
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ _input_im2col_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
+}
+
+void NEConvolutionLayer::run()
+{
+ // Run weights reshaping (runs only once per configure call)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
+ }
+
+ // Run input reshaping
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
+
+ // Runs matrix multiply on reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+ // Reshape output matrix
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
new file mode 100644
index 0000000000..7d2c5494a9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenate::NEDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+ unsigned int depth_offset = 0;
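+ // depth_offset tracks where each input's channels start along the depth axis of the output tensor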
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void NEDepthConcatenate::run()
+{
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
new file mode 100644
index 0000000000..a339cae316
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
+
+ auto k = arm_compute::cpp14::make_unique<NEDepthConvertKernel>();
+ k->configure(input, output, policy, shift);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
new file mode 100644
index 0000000000..2887c13233
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDerivative::NEDerivative()
+ : _kernel(), _border_handler()
+{
+}
+
+void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
+
+void NEDerivative::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
new file mode 100644
index 0000000000..0c016f14f9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDilate.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEDilateKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
new file mode 100644
index 0000000000..3f3e7710fb
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEDirectConvolutionLayer::NEDirectConvolutionLayer()
+ : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+{
+}
+
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+ // Free accumulator
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ // Allocate the intermediate accumulator tensor in case of fixed point input
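+ // The convolution accumulates into QS16 to avoid overflowing the QS8 range; the bias
+ // accumulation kernel then writes the final QS8 result to the output.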
+ if(output->info()->data_type() == DataType::QS8)
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ }
+
+ // Add zero padding XY
+ _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void NEDirectConvolutionLayer::run()
+{
+ _input_border_handler.run(_input_border_handler.window());
+
+ NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
new file mode 100644
index 0000000000..f6ec677e44
--- /dev/null
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEEqualizeHistogram::NEEqualizeHistogram()
+ : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+{
+}
+
+void NEEqualizeHistogram::configure(const IImage *input, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Configure kernels
+ _histogram_kernel.configure(input, &_hist);
+ _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
+ _map_histogram_kernel.configure(input, &_cd_lut, output);
+}
+
+void NEEqualizeHistogram::run()
+{
+ // Calculate histogram of input.
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+
+ // Calculate cumulative distribution of histogram and create LUT.
+ _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+
+ // Map input to output using created LUT.
+ NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
new file mode 100644
index 0000000000..9b011db845
--- /dev/null
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEErode.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEErodeKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
new file mode 100644
index 0000000000..33a58f1904
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEFastCorners::NEFastCorners()
+ : _fast_corners_kernel(),
+ _border_handler(),
+ _nonmax_kernel(),
+ _fill_kernel(),
+ _output(),
+ _suppressed(),
+ _non_max(false)
+{
+}
+
+void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+ ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255);
+
+ _non_max = nonmax_suppression;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
+ _output.allocator()->init(tensor_info);
+
+ // If the border mode is UNDEFINED, _fast_corners_kernel operates on the x window
+ // (3, width - 3) and the y window (3, height - 3), so the border pixels of the
+ // output image are left unchanged. This is reflected in the valid region of the
+ // output. Non-maxima suppression is only run on the valid pixels.
+ _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
+ _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
+
+ if(!_non_max)
+ {
+ _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners);
+ }
+ else
+ {
+ _suppressed.allocator()->init(tensor_info);
+ _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
+ _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+
+ // Allocate intermediate tensors
+ _suppressed.allocator()->allocate();
+ }
+
+ // Allocate intermediate tensors
+ _output.allocator()->allocate();
+}
+
+void NEFastCorners::run()
+{
+ _border_handler.run(_border_handler.window());
+
+ NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
+
+ if(_non_max)
+ {
+ NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
new file mode 100644
index 0000000000..e884f4a668
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
+
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ _border_handler.configure(input, border_width, border_mode, constant_border_value);
+}
+
+void NEFillBorder::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
new file mode 100644
index 0000000000..abb41e9f70
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void NEFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+ if(_is_batched_fc_layer)
+ {
+ NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
+ }
+}
+
+NEFullyConnectedLayer::NEFullyConnectedLayer()
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
+{
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, input->info()->dimension(3));
+ shape_im2col.set(2, input->info()->dimension(4));
+ shape_im2col.set(3, input->info()->dimension(5));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = input->info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, 1);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
+ {
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ weights_to_use = &_reshape_weights_output;
+
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer with batches
+ configure_conv_fc_wb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer with batches
+ configure_fc_fc_wb(input, weights_to_use, output);
+ }
+ }
+ else
+ {
+ // In case of a non-batched fully connected layer, the weights are not reshaped using the transposed 1xW layout
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc_nb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc_nb(input, weights_to_use, output);
+ }
+ }
+
+ // Allocate the reshaped-weights tensor only if are_weights_reshaped is false, once all the configure methods have been called
+ if(!are_weights_reshaped)
+ {
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+ }
+}
+
+void NEFullyConnectedLayer::run()
+{
+ // Reshape the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
+ }
+
+ // Linearize the input if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
+ }
+
+ // Interleave input
+ if(_is_batched_fc_layer)
+ {
+ NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply
+ NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
+
+ // Accumulate biases if provided
+ if(_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+}
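A minimal usage sketch of the configure()/run() pair added above. The shapes (a 2x2x3 feature map feeding a 10-output layer) and the weight layout are illustrative assumptions, not taken from this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void fully_connected_example()
    {
        Tensor input, weights, biases, output;

        // Convolution output of 2x2x3 = 12 values feeding 10 neurons; the weights are
        // provided as a 12x10 matrix and transposed internally (transpose_weights = true).
        input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 3U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(12U, 10U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

        NEFullyConnectedLayer fc;
        fc.configure(&input, &weights, &biases, &output, true /* transpose_weights */, false /* are_weights_reshaped */);

        input.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        output.allocator()->allocate();

        // ... fill input, weights and biases ...
        fc.run();
    }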
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
new file mode 100644
index 0000000000..15d5f4effb
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+NEGEMM::NEGEMM()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+{
+}
+
+void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+ if((a->info()->dimension(1) == 1))
+ {
+ _run_vector_matrix_multiplication = true;
+
+ // Configure the matrix multiply kernel
+ _mm_kernel.configure(a, b, d, alpha);
+ }
+ else
+ {
+ _run_vector_matrix_multiplication = false;
+
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ switch(a->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 4);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
+ break;
+ }
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 8);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
+ break;
+ }
+#endif
+ case DataType::QS8:
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+ }
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+ // Allocate once all the configure methods have been called
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
+}
+
+void NEGEMM::run()
+{
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply kernel
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
+}
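For reference, a small sketch restating the F32 branch of the reshape logic above (A is interleaved in 4x4 blocks, B is transposed in 1xW chunks with W = 4 for 32-bit elements). Function names and the worked numbers are illustrative only; widths are listed first, matching dimension 0 of the tensors:

    #include <cmath>
    #include <cstddef>
    #include <utility>

    // Returns the {width, height} pairs of the temporary tensors for A (width K, height M)
    // and B (width N, height K) in the F32 case.
    std::pair<std::pair<std::size_t, std::size_t>, std::pair<std::size_t, std::size_t>>
    gemm_tmp_shapes_f32(std::size_t a_width, std::size_t a_height, std::size_t b_width, std::size_t b_height)
    {
        const std::pair<std::size_t, std::size_t> tmp_a{ a_width * 4, static_cast<std::size_t>(std::ceil(a_height / 4.0f)) };
        const std::pair<std::size_t, std::size_t> tmp_b{ b_height * 4, static_cast<std::size_t>(std::ceil(b_width / 4.0f)) };
        return { tmp_a, tmp_b };
    }

    // Example: A of 64x30 and B of 32x64 give tmp_a = 256x8 and tmp_b = 256x8.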
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
new file mode 100644
index 0000000000..4c77c88656
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+using namespace arm_compute;
+
+void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
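The kernel wrapped here lays the values of each block of four input rows out column by column on a single output row. A scalar reference sketch of that layout, assuming the height is a multiple of 4 (padding handling omitted):

    #include <cstddef>
    #include <vector>

    std::vector<float> interleave4x4_reference(const std::vector<float> &in, std::size_t width, std::size_t height)
    {
        std::vector<float> out(width * height);
        std::size_t o = 0;
        for(std::size_t y = 0; y < height; y += 4)   // one output row per 4 input rows
        {
            for(std::size_t x = 0; x < width; ++x)   // walk the columns of the block
            {
                for(std::size_t r = 0; r < 4; ++r)   // emit the 4 rows' values back to back
                {
                    out[o++] = in[(y + r) * width + x];
                }
            }
        }
        return out;
    }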
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
new file mode 100644
index 0000000000..b64f769459
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEGEMMLowp::NEGEMMLowp()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+{
+}
+
+void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+
+ /* The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ] */
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
+ _interleave_kernel.configure(a, &_tmp_a);
+ _transpose_kernel.configure(b, &_tmp_b);
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
+
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+}
+
+void NEGEMMLowp::run()
+{
+ /* Run interleave kernel */
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ /* Run transpose kernel */
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+
+ /* Run matrix multiply kernel */
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+}
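A scalar sketch of the per-element arithmetic implied by the offset, multiplier and shift parameters above, assuming the usual convention for this low-precision interface (offsets added to the 8-bit inputs, fixed-point requantisation of the 32-bit accumulator). Treat it as an illustration of the parameters' roles, not the kernel's exact rounding behaviour:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    uint8_t gemmlowp_reference_element(const std::vector<uint8_t> &a_row, const std::vector<uint8_t> &b_col,
                                       int32_t a_offset, int32_t b_offset,
                                       int32_t output_offset, int32_t output_mult_int, int32_t shift)
    {
        int32_t acc = 0;
        for(std::size_t k = 0; k < a_row.size(); ++k)
        {
            // Offsets are added to the raw 8-bit values before accumulation
            acc += (static_cast<int32_t>(a_row[k]) + a_offset) * (static_cast<int32_t>(b_col[k]) + b_offset);
        }
        // Requantise the accumulator back to 8 bits
        acc = ((acc + output_offset) * output_mult_int) >> shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, acc)));
    }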
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
new file mode 100644
index 0000000000..dc40ecec14
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
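As a worked example of the 1xW layout used by this function (and by the reshape paths above): W = 16 / element_size, so for F32 (W = 4) a 24x10 matrix becomes a 40x6 tensor, since dimension 0 grows to 10 * 4 and dimension 1 shrinks to ceil(24 / 4). These numbers are illustrative only.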
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
new file mode 100644
index 0000000000..95ba5cbdf9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGaussian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
new file mode 100644
index 0000000000..5ccc765966
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEGaussian5x5::NEGaussian5x5()
+ : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+{
+}
+
+void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ // Init temporary buffer
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+ _tmp.allocator()->init(tensor_info);
+
+ // Create and configure kernels for the two passes
+ _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+
+ _tmp.allocator()->allocate();
+
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEGaussian5x5::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+}
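A scalar sketch of why the two passes above need an S16 intermediate: the 5x5 Gaussian separates into a horizontal and a vertical pass, and the horizontal sum can exceed 8 bits. The binomial taps [1 4 6 4 1] are an assumption here (the classic choice), not read from this patch:

    #include <cstdint>

    // Horizontal pass: 8-bit row in, 16-bit value out (the maximum 16 * 255 fits in int16_t).
    int16_t gaussian5x5_hor(const uint8_t *row, int x)
    {
        return static_cast<int16_t>(row[x - 2] + 4 * row[x - 1] + 6 * row[x] + 4 * row[x + 1] + row[x + 2]);
    }

    // Vertical pass over five intermediate rows, normalised by 16 * 16 = 256.
    uint8_t gaussian5x5_vert(const int16_t *r0, const int16_t *r1, const int16_t *r2,
                             const int16_t *r3, const int16_t *r4, int x)
    {
        const int32_t sum = r0[x] + 4 * r1[x] + 6 * r2[x] + 4 * r3[x] + r4[x];
        return static_cast<uint8_t>(sum / 256);
    }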
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
new file mode 100644
index 0000000000..e1d64f11f6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+NEGaussianPyramid::NEGaussianPyramid()
+ : _input(nullptr), _pyramid(nullptr), _tmp()
+{
+}
+
+NEGaussianPyramidHalf::NEGaussianPyramidHalf()
+ : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+{
+}
+
+void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _border_handler = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+
+ // Apply half scale to the X dimension of the tensor shape
+ TensorShape tensor_shape = pyramid->info()->tensor_shape();
+ tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
+ _tmp.init(pyramid_info);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure horizontal kernel */
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure vertical kernel */
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure border */
+ _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void NEGaussianPyramidHalf::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _border_handler[i].run(_border_handler[i].window());
+ NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+ }
+}
+
+NEGaussianPyramidOrb::NEGaussianPyramidOrb()
+ : _offsets(), _gaus5x5(), _scale_nearest()
+{
+}
+
+void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _gaus5x5 = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
+ _offsets = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+ _tmp.init(pyramid_info);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ const size_t width = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0);
+ const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1);
+
+ /* Allocate Image for the offsets used by NEAREST interpolation */
+ TensorInfo tensor_info(TensorShape(width, height), Format::S32);
+ _offsets[i].allocator()->init(tensor_info);
+
+ /* Configure Gaussian 5x5 */
+ _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+
+ /* Configure scale image kernel */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
+ border_mode == BorderMode::UNDEFINED);
+
+ _offsets[i].allocator()->allocate();
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void NEGaussianPyramidOrb::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _gaus5x5[i].run();
+ NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
+ }
+}
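For orientation, the two variants above only differ in their scale factor: SCALE_PYRAMID_HALF is 0.5 and SCALE_PYRAMID_ORB is 2^(-1/4) (about 0.8409). A sketch of how a level dimension shrinks; the exact per-level rounding is an assumption here (PyramidInfo owns that detail):

    #include <cmath>
    #include <cstddef>

    std::size_t pyramid_level_dimension(std::size_t level0_dim, float scale, std::size_t level)
    {
        return static_cast<std::size_t>(std::ceil(level0_dim * std::pow(scale, static_cast<float>(level))));
    }

    // Example: a 640-wide level 0 gives 640, 320, 160, ... with SCALE_PYRAMID_HALF
    // and 640, 539, 453, ... with SCALE_PYRAMID_ORB (given the ceil rounding assumed here).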
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
new file mode 100644
index 0000000000..a592f53d44
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHOGDescriptor::NEHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate memory for magnitude, phase and hog space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialise gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialise orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialise block normalization kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
+
+void NEHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning kernel
+ NEScheduler::get().schedule(&_orient_bin, Window::DimY);
+
+ // Run block normalization kernel
+ NEScheduler::get().schedule(&_block_norm, Window::DimY);
+}
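For concreteness, with an illustrative 64x128 input and 8x8 cells (the canonical pedestrian-detection HOG setup), the grid computed above is num_cells_x = 64 / 8 = 8 and num_cells_y = 128 / 8 = 16, so the HOG space is an 8x16 tensor with num_bins channels per cell; the block normalization kernel then turns that space into the final descriptor written to output.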
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
new file mode 100644
index 0000000000..e8ed29d0b9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+
+using namespace arm_compute;
+
+void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
+ k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
+ _kernel = std::move(k);
+} \ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
new file mode 100644
index 0000000000..2f4b8802e3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHOGGradient::NEHOGGradient()
+ : _derivative(), _mag_phase(nullptr), _gx(), _gy()
+{
+}
+
+void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+ // Initialise derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialise magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+ k->configure(&_gx, &_gy, output_magnitude, output_phase);
+ _mag_phase = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(&_gx, &_gy, output_magnitude, output_phase);
+ _mag_phase = std::move(k);
+ }
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void NEHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+}
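A scalar sketch of what the selected magnitude/phase kernel computes per pixel: the L2 norm of the derivative pair and the gradient direction, folded onto [0, 180) for unsigned phase. The exact quantisation of the angle into the U8 output is an assumption left out here:

    #include <cmath>
    #include <cstdint>

    void magnitude_phase_reference(int16_t gx, int16_t gy, int16_t &magnitude, float &phase_deg, bool signed_phase)
    {
        constexpr float rad_to_deg = 57.2957795f; // 180 / pi

        magnitude = static_cast<int16_t>(std::lround(std::sqrt(static_cast<float>(gx) * gx + static_cast<float>(gy) * gy)));

        phase_deg = std::atan2(static_cast<float>(gy), static_cast<float>(gx)) * rad_to_deg;
        if(phase_deg < 0.0f)
        {
            phase_deg += 360.0f;
        }
        if(!signed_phase && phase_deg >= 180.0f)
        {
            phase_deg -= 180.0f; // unsigned phase treats opposite directions as equal
        }
    }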
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
new file mode 100644
index 0000000000..173b8f4c42
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+NEHOGMultiDetection::NEHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+ /* Check if NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+ *
+ * 1) NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
+ * Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+ * 2) NEHOGBlockNormalizationKernel is skipped if the cell size, the number of bins, the block size and the block stride do not change.
+ * Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+ *
+ * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+ * with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+ */
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of NEHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialise gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+ // Configure NETensor for the HOG space and orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialise orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure NETensor for the normalized HOG space and block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ // Configure non maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void NEHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ }
+}
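As a worked example of the skipping logic above: with three models where models 0 and 1 share the cell size and bin count but differ in block size, and model 2 uses a different cell size, the vectors become input_orient_bin = {0, 2}, input_block_norm = {(0,0), (1,0), (2,1)} and input_hog_detect = {0, 1, 2}, i.e. two orientation-binning kernels, three block-normalization kernels and three detectors.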
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
new file mode 100644
index 0000000000..b54fb67ab7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+#include <utility>
+
+using namespace arm_compute;
+
+NEHarrisCorners::NEHarrisCorners()
+ : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0)
+{
+}
+
+void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
+ float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+
+ const TensorShape shape = input->info()->tensor_shape();
+ TensorInfo tensor_info_gxgy;
+
+ if(gradient_size < 7)
+ {
+ tensor_info_gxgy.init(shape, Format::S16);
+ }
+ else
+ {
+ tensor_info_gxgy.init(shape, Format::S32);
+ }
+
+ _gx.allocator()->init(tensor_info_gxgy);
+ _gy.allocator()->init(tensor_info_gxgy);
+
+ TensorInfo tensor_info_score(shape, Format::F32);
+ _score.allocator()->init(tensor_info_score);
+ _nonmax.allocator()->init(tensor_info_score);
+
+ _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ // Set/init Sobel kernel according to gradient_size
+ switch(gradient_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Gradient size not implemented");
+ }
+
+ // Normalization factor
+ const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
+
+ if(use_fp16)
+ {
+ switch(block_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ default:
+ break;
+ }
+ }
+ else
+ {
+ // Set/init Harris Score kernel according to block_size
+ switch(block_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ default:
+ break;
+ }
+ }
+
+ // Configure border filling before harris score
+ _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+ _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Init Euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+
+ // Allocate once all the configure methods have been called
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _score.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void NEHarrisCorners::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+
+ // Reset the number of corner candidates to 0
+ _num_corner_candidates = 0;
+
+ // Run Sobel kernel
+ _sobel->run();
+
+ // Fill border before harris score kernel
+ _border_gx.run(_border_gx.window());
+ _border_gy.run(_border_gy.window());
+
+ // Run harris score kernel
+ NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
+
+ // Run non-maxima suppression
+ _non_max_suppr.run();
+
+ // Run corner candidate kernel
+ NEScheduler::get().schedule(&_candidates, Window::DimY);
+
+ // Run sort & Euclidean distance
+ _sort_euclidean.run(_sort_euclidean.window());
+}
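A scalar sketch of the corner measure the score kernels above evaluate, assuming the standard Harris response det(M) - k * trace(M)^2 over the block window, with the gradients pre-scaled by norm_factor (255 for the 8-bit input range, 4^(gradient_size/2) for the Sobel gain, block_size for the window sum). The windowed sums are assumed inputs here:

    // gx2_sum, gy2_sum and gxgy_sum are the block-window sums of gx*gx, gy*gy and gx*gy
    // after scaling each gradient by norm_factor; sensitivity corresponds to the Harris k.
    float harris_response_reference(float gx2_sum, float gy2_sum, float gxgy_sum, float sensitivity)
    {
        const float det   = gx2_sum * gy2_sum - gxgy_sum * gxgy_sum;
        const float trace = gx2_sum + gy2_sum;
        return det - sensitivity * trace * trace;
    }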
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
new file mode 100644
index 0000000000..c42b2a56e0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHistogram::NEHistogram()
+ : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+{
+}
+
+void NEHistogram::configure(const IImage *input, IDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    // Allocate space for the threads' local histograms
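+    // (one local histogram of num_bins bins per scheduler thread, so each thread can accumulate without contending on the shared output)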
+ _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
+ _local_hist = arm_compute::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+
+ // Configure kernel
+ _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+}
+
+void NEHistogram::run()
+{
+ // Calculate histogram of input.
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
new file mode 100644
index 0000000000..af604e9295
--- /dev/null
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEIntegralImage::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEIntegralImageKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
+}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
new file mode 100644
index 0000000000..8232c79f2d
--- /dev/null
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+NELaplacianPyramid::NELaplacianPyramid()
+ : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function()
+{
+}
+
+void NELaplacianPyramid::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
+
+ // Compute Gaussian Pyramid
+ _gaussian_pyr_function.run();
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+        // Apply the 5x5 Gaussian filter to the Gaussian pyramid level
+ _convf[i].run();
+ }
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+        // Compute the Laplacian level
+ _subf[i].run();
+ }
+
+ _depth_function.run();
+}
+
+void NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ _num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the gaussian pyramid and the convoluted pyramid
+ PyramidInfo pyramid_info;
+ pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
+
+ _gauss_pyr.init(pyramid_info);
+ _conv_pyr.init(pyramid_info);
+
+ // Create Gaussian Pyramid function
+ _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+
+ _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+
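+    // Each Laplacian level i is the difference between Gaussian pyramid level i and its 5x5 Gaussian-filtered copy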
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+ _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+ }
+
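+    // Convert the coarsest filtered Gaussian level from U8 to S16: it is the low-resolution residual used later for reconstruction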
+ _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+ _gauss_pyr.allocate();
+ _conv_pyr.allocate();
+}
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
new file mode 100644
index 0000000000..36ac4a74d1
--- /dev/null
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+NELaplacianReconstruct::NELaplacianReconstruct()
+ : _tmp_pyr(), _addf(), _scalef(), _depthf()
+{
+}
+
+void NELaplacianReconstruct::configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
+ PyramidInfo pyramid_info;
+ pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
+
+ _tmp_pyr.init(pyramid_info);
+
+ // Allocate add and scale functions. Level 0 does not need to be scaled.
+ _addf = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::cpp14::make_unique<NEScale[]>(num_levels - 1);
+
+ const size_t last_level = num_levels - 1;
+
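+    // Seed the coarsest level: tmp(last_level) = input (low-resolution residual) + Laplacian(last_level)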
+ _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
+
+ // Scale levels n-1 to 1, and add levels n-2 to 0
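+    // i.e. tmp(l) = upsample(tmp(l + 1)) + Laplacian(l), working from the coarsest level towards level 0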
+ for(size_t l = 0; l < last_level; ++l)
+ {
+ _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+ _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
+ }
+
+ // Convert level 0 from S16 to U8
+ _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+ _tmp_pyr.allocate();
+}
+
+void NELaplacianReconstruct::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+
+ const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
+
+ _addf[last_level].run();
+
+    // Run levels l = last_level - 1 down to 0
+ for(size_t l = last_level; l-- > 0;)
+ {
+ _scalef[l].run();
+ _addf[l].run();
+ }
+
+ _depthf.run();
+}
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
new file mode 100644
index 0000000000..85d7ba3650
--- /dev/null
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NELocallyConnectedLayer::NELocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
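+    // Unlike a convolution, there is one distinct weight matrix per output spatial position (dimension 4 == conv_w * conv_h, checked above)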
+
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
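+    // Pipeline: im2col the input, reshape the weights, multiply each im2col row by its position's weight matrix, then col2im back to the output layout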
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
+
+void NELocallyConnectedLayer::run()
+{
+    // Run weights reshaping (runs only once per configure call)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ }
+
+ // Run input reshaping
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+    // Run GEMM on the reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+
+ // Reshape output matrix
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
new file mode 100644
index 0000000000..9390ca2b6a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16)
+{
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
new file mode 100644
index 0000000000..47143f5e5b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
+
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEMeanStdDev::NEMeanStdDev()
+ : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0)
+{
+}
+
+void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev)
+{
+ _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+}
+
+void NEMeanStdDev::run()
+{
+ _global_sum = 0;
+ _global_sum_squared = 0;
+
+ NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
new file mode 100644
index 0000000000..aa7cc97081
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
new file mode 100644
index 0000000000..cab9200cf8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEMinMaxLocation::NEMinMaxLocation()
+ : _min_max(), _min_max_loc()
+{
+}
+
+void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+{
+ _min_max.configure(input, min, max);
+ _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
+}
+
+void NEMinMaxLocation::run()
+{
+ _min_max.reset();
+
+    // Run min/max kernel
+ NEScheduler::get().schedule(&_min_max, Window::DimY);
+
+    // Run min/max location kernel
+ NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
new file mode 100644
index 0000000000..01aea3b671
--- /dev/null
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ BorderMode border_mode,
+ uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NENonLinearFilterKernel>();
+ k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
new file mode 100644
index 0000000000..a7b3759a45
--- /dev/null
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
+{
+ auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+
+ if(border_mode != BorderMode::UNDEFINED)
+ {
+ _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+ }
+ else
+ {
+ _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+ }
+}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
new file mode 100644
index 0000000000..69ff32591f
--- /dev/null
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NENormalizationLayer::NENormalizationLayer()
+ : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+{
+}
+
+void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _input_squared.allocator()->init(tensor_info);
+
+ // Configure kernels
+ _norm_kernel.configure(input, &_input_squared, output, norm_info);
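+    // The multiply kernel squares the input element-wise into _input_squared, which the normalization kernel consumes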
+ _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f));
+
+ // Allocate the tensor once the configure methods have been called
+ _input_squared.allocator()->allocate();
+}
+
+void NENormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
new file mode 100644
index 0000000000..49135e442c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEOpticalFlow::NEOpticalFlow()
+ : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(),
+ _num_levels(0)
+{
+}
+
+void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates,
+ IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == old_points);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points);
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
+
+ _num_levels = old_pyramid->info()->num_levels();
+ _old_points = old_points;
+ _new_points = new_points;
+ _new_points_estimates = new_points_estimates;
+
+ const float pyr_scale = old_pyramid->info()->scale();
+
+ _func_scharr = arm_compute::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
+ _kernel_tracker = arm_compute::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
+ _scharr_gx = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+ _scharr_gy = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+
+ _old_points_internal = LKInternalKeypointArray(old_points->num_values());
+ _new_points_internal = LKInternalKeypointArray(old_points->num_values());
+ _new_points->resize(old_points->num_values());
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+        // Get the images from the i-th level of the old and new pyramids
+ IImage *old_ith_input = old_pyramid->get_pyramid_level(i);
+ IImage *new_ith_input = new_pyramid->get_pyramid_level(i);
+
+ // Get width and height of images
+ const unsigned int width_ith = old_ith_input->info()->dimension(0);
+ const unsigned int height_ith = new_ith_input->info()->dimension(1);
+
+ TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
+
+ _scharr_gx[i].allocator()->init(tensor_info);
+ _scharr_gy[i].allocator()->init(tensor_info);
+
+ // Init Scharr kernel
+ _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+
+ // Init Lucas-Kanade kernel
+ _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
+ old_points, new_points_estimates, new_points,
+ &_old_points_internal, &_new_points_internal,
+ termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+ i, _num_levels, pyr_scale);
+
+ _scharr_gx[i].allocator()->allocate();
+ _scharr_gy[i].allocator()->allocate();
+ }
+}
+
+void NEOpticalFlow::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+
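+    // Process the pyramid from the coarsest level down to level 0, refining the tracked points at each level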
+ for(unsigned int level = _num_levels; level > 0; --level)
+ {
+ // Run Scharr kernel
+ _func_scharr[level - 1].run();
+
+ // Run Lucas-Kanade kernel
+ NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
new file mode 100644
index 0000000000..7683f461d3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPhase.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, nullptr, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..056d33b370
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
new file mode 100644
index 0000000000..6f0cc4f160
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+
+using namespace arm_compute;
+
+void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ // Configure pooling kernel
+ auto k = arm_compute::cpp14::make_unique<NEPoolingLayerKernel>();
+ k->configure(input, output, pool_info);
+ _kernel = std::move(k);
+
+ // Configure border depending on operation required
+ BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+}
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
new file mode 100644
index 0000000000..9f06fb699c
--- /dev/null
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+ auto k = arm_compute::cpp14::make_unique<NERemapKernel>();
+
+ k->configure(input, map_x, map_y, output, policy);
+
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
new file mode 100644
index 0000000000..b70f626df0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEScale.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
+ win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
+
+ if(dx != nullptr && dy != nullptr)
+ {
+ // Pre-compute the offset and pixel's distance for BILINEAR interpolation
+ Iterator offsets_it(offsets, win);
+ Iterator dx_it(dx, win);
+ Iterator dy_it(dy, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
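+            // Map the output pixel centre back into the input image (half-pixel alignment), then split the
+            // coordinate into an integer byte offset and the fractional distances (dx, dy) used for bilinear weighting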
+ const float in_x = (id.x() + 0.5f) * wr - 0.5f;
+ const float in_y = (id.y() + 0.5f) * hr - 0.5f;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
+ }
+ else
+ {
+ // Pre-compute the offset for NEAREST interpolation
+ Iterator offsets_it(offsets, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const size_t in_xi = (id.x() + 0.5f) * wr;
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ },
+ offsets_it);
+ }
+}
+} // namespace
+
+NEScale::NEScale()
+ : _offsets(), _dx(), _dy()
+{
+}
+
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ // Get the tensor shape
+ const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ // Get the element size of the input image
+ const size_t input_element_size = input->info()->element_size();
+
+    // Area interpolation behaves as Nearest Neighbour when up-sampling
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+
+ auto k = arm_compute::cpp14::make_unique<NEScaleKernel>();
+
+ // Check if the border mode is UNDEFINED
+ const bool border_undefined = border_mode == BorderMode::UNDEFINED;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ _offsets.allocator()->init(tensor_info_offsets);
+
+ k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+
+ // Allocate once the configure methods have been called
+ _offsets.allocator()->allocate();
+
+ // Pre-compute offsets for nearest interpolation
+ precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dxdy(shape, Format::F32);
+
+ _offsets.allocator()->init(tensor_info_offsets);
+ _dx.allocator()->init(tensor_info_dxdy);
+ _dy.allocator()->init(tensor_info_dxdy);
+
+ k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+
+ // Allocate once the configure methods have been called
+ _offsets.allocator()->allocate();
+ _dx.allocator()->allocate();
+ _dy.allocator()->allocate();
+
+ // Pre-compute dx, dy and offsets for bilinear interpolation
+ precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
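+            // Area interpolation does not use precomputed offsets or dx/dy tensors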
+ k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
new file mode 100644
index 0000000000..04b3f14ce7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEScharr3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
new file mode 100644
index 0000000000..3b46fd78c1
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
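A short usage sketch for the new NESobel3x3 wrapper (illustrative only; the S16 gradient format is an assumption consistent with the 5x5 variant below, and either output may be nullptr to compute a single gradient):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
    #include "arm_compute/runtime/Tensor.h"

    void sobel3x3_example()
    {
        using namespace arm_compute;

        Tensor src{}, gx{}, gy{};
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
        gx.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));
        gy.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));

        NESobel3x3 sobel;
        sobel.configure(&src, &gx, &gy, BorderMode::REPLICATE); // pass nullptr for gx or gy to skip that gradient

        src.allocator()->allocate();
        gx.allocator()->allocate();
        gy.allocator()->allocate();
        sobel.run();
    }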
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
new file mode 100644
index 0000000000..8967a22ba1
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NESobel5x5::NESobel5x5()
+ : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+{
+}
+
+void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NESobel5x5::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+}
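Note on the structure above: the 5x5 filter runs as two separable passes. _sobel_hor convolves the U8 input with the horizontal 1-D components into S16 intermediates (_tmp_x/_tmp_y), and _sobel_vert applies the vertical components into the caller's outputs; run() schedules them back to back after the border handler. For reference, a sketch of the usual separable decomposition (assumed here, not quoted from the kernels):

    // Gx(5x5) = [1 4 6 4 1]^T (vertical smoothing)   *  [-1 -2 0 2 1] (horizontal derivative)
    // Gy(5x5) = [-1 -2 0 2 1]^T (vertical derivative) *  [1 4 6 4 1]  (horizontal smoothing)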
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
new file mode 100644
index 0000000000..f628da9709
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NESobel7x7::NESobel7x7()
+ : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+{
+}
+
+void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NESobel7x7::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
new file mode 100644
index 0000000000..0651eab1bc
--- /dev/null
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cfloat>
+
+using namespace arm_compute;
+
+NESoftmaxLayer::NESoftmaxLayer()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+{
+}
+
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+ // Create intermediate tensor shapes
+ TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _tmp.allocator()->init(tensor_info_tmp);
+
+ TensorShape shape = input->info()->tensor_shape();
+ shape.set(0, 1);
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _max.allocator()->init(tensor_info_max_sum);
+ _sum.allocator()->init(tensor_info_max_sum);
+
+ // Configure Kernels
+ _max_kernel.configure(input, &_max);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _norm_kernel.configure(&_tmp, &_sum, output);
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+
+ // Allocate intermediate tensors
+ _tmp.allocator()->allocate();
+ _max.allocator()->allocate();
+ _sum.allocator()->allocate();
+}
+
+void NESoftmaxLayer::run()
+{
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_max_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
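The layer is assembled from three kernels that together compute a numerically stable softmax per row: _max_kernel finds the row maximum (with the border pre-filled to -FLT_MAX so padding cannot win), _shift_exp_sum_kernel subtracts that maximum, exponentiates and accumulates the sum, and _norm_kernel divides by it. A scalar reference of the same computation, for illustration only (not the NEON kernels):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> softmax_reference(const std::vector<float> &row)
    {
        // 1) Row maximum
        const float max_val = *std::max_element(row.begin(), row.end());

        // 2) Shift, exponentiate, accumulate the sum
        std::vector<float> out(row.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < row.size(); ++i)
        {
            out[i] = std::exp(row[i] - max_val); // subtracting the max avoids overflow in exp()
            sum += out[i];
        }

        // 3) Normalise
        for(float &v : out)
        {
            v /= sum;
        }
        return out;
    }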
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
new file mode 100644
index 0000000000..ebb8a0ac9b
--- /dev/null
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NETableLookupKernel>();
+ k->configure(input, lut, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
new file mode 100644
index 0000000000..93dc124880
--- /dev/null
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ auto k = arm_compute::cpp14::make_unique<NEThresholdKernel>();
+ k->configure(input, output, threshold, false_value, true_value, type, upper);
+ _kernel = std::move(k);
+}
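A brief usage sketch of the new NEThreshold wrapper (illustrative; parameter order as in the configure() call above). For BINARY thresholding the 'upper' argument is unused; it only applies to RANGE thresholding:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEThreshold.h"
    #include "arm_compute/runtime/Tensor.h"

    void threshold_example()
    {
        using namespace arm_compute;

        Tensor src{}, dst{};
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
        dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        NEThreshold threshold;
        // Pixels above 127 become 255, all others become 0
        threshold.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        threshold.run();
    }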
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
new file mode 100644
index 0000000000..53ac9c5ee3
--- /dev/null
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NETranspose::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
new file mode 100644
index 0000000000..24fb16f9e3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEWarpAffine::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ default:
+ ARM_COMPUTE_ERROR("Interpolation type not supported");
+ }
+
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
new file mode 100644
index 0000000000..84b2df5bfa
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEWarpPerspective::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ default:
+ ARM_COMPUTE_ERROR("Interpolation type not supported");
+ }
+
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
new file mode 100644
index 0000000000..0cced73276
--- /dev/null
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <omp.h>
+
+using namespace arm_compute;
+
+OMPScheduler &OMPScheduler::get()
+{
+ static OMPScheduler scheduler;
+ return scheduler;
+}
+
+OMPScheduler::OMPScheduler()
+ : _num_threads(omp_get_max_threads())
+{
+}
+
+unsigned int OMPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
+void OMPScheduler::set_num_threads(unsigned int num_threads)
+{
+ const unsigned int num_cores = omp_get_max_threads();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
+}
+
+void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "No kernel was provided to the scheduler");
+
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
+
+ if(!kernel->is_parallelisable() || 1 == num_threads)
+ {
+ kernel->run(max_window);
+ }
+ else
+ {
+ #pragma omp parallel num_threads(num_threads)
+ {
+ #pragma omp for
+ for(unsigned int t = 0; t < num_threads; ++t)
+ {
+ Window win = max_window.split_window(split_dimension, t, num_threads);
+ win.set_thread_id(t);
+ win.set_num_threads(num_threads);
+ kernel->run(win);
+ }
+ }
+ }
+}
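A small sketch of driving the new scheduler directly (illustrative; it requires a build with openmp=1). Each thread of the parallel region runs one sub-window obtained from split_window() along the chosen dimension:

    #include "arm_compute/core/CPP/ICPPKernel.h"
    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/OMP/OMPScheduler.h"

    void schedule_with_omp(arm_compute::ICPPKernel *kernel)
    {
        using namespace arm_compute;

        OMPScheduler::get().set_num_threads(4); // 0 means "use omp_get_max_threads()"
        OMPScheduler::get().schedule(kernel, Window::DimY);
    }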
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
new file mode 100644
index 0000000000..f1b6c93b50
--- /dev/null
+++ b/src/runtime/Pyramid.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Pyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void Pyramid::init(const PyramidInfo &info)
+{
+ internal_init(info, false);
+}
+
+void Pyramid::init_auto_padding(const PyramidInfo &info)
+{
+ internal_init(info, true);
+}
+
+void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
+{
+ _info = info;
+ _pyramid = arm_compute::cpp14::make_unique<Tensor[]>(_info.num_levels());
+
+ size_t w = _info.width();
+ size_t h = _info.height();
+ size_t ref_w = w;
+ size_t ref_h = h;
+ bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
+ TensorShape tensor_shape = _info.tensor_shape();
+
+ // Note: Look-up table used by the OpenVX sample implementation
+ const float c_orbscale[4] = { 0.5f,
+ SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+ };
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ TensorInfo tensor_info(tensor_shape, _info.format());
+
+ if(auto_padding)
+ {
+ tensor_info.auto_padding();
+ }
+
+ (_pyramid.get() + i)->allocator()->init(tensor_info);
+
+ if(is_orb_scale)
+ {
+ float orb_scale = c_orbscale[(i + 1) % 4];
+ w = static_cast<int>(std::ceil(static_cast<float>(ref_w) * orb_scale));
+ h = static_cast<int>(std::ceil(static_cast<float>(ref_h) * orb_scale));
+
+ if(0 == ((i + 1) % 4))
+ {
+ ref_w = w;
+ ref_h = h;
+ }
+ }
+ else
+ {
+ w = (w + 1) * _info.scale();
+ h = (h + 1) * _info.scale();
+ }
+
+ // Update tensor_shape
+ tensor_shape.set(0, w);
+ tensor_shape.set(1, h);
+ }
+}
+
+void Pyramid::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ (_pyramid.get() + i)->allocator()->allocate();
+ }
+}
+
+const PyramidInfo *Pyramid::info() const
+{
+ return &_info;
+}
+
+Tensor *Pyramid::get_pyramid_level(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
+
+ return (_pyramid.get() + index);
+}
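Worked example of the sizing loop above (half-scale pyramid, sketch): starting from 640x480 with scale 0.5, the update w = (w + 1) * scale produces 640 -> 320 -> 160 -> 80 and 480 -> 240 -> 120 -> 60, so a four-level pyramid holds 640x480, 320x240, 160x120 and 80x60 images; the +1 keeps odd dimensions from shrinking too aggressively (e.g. 5 -> 3 -> 2). For ORB pyramids the per-level scale comes from the c_orbscale table instead, with the reference size refreshed every fourth level.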
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
new file mode 100644
index 0000000000..a131928293
--- /dev/null
+++ b/src/runtime/Scheduler.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "arm_compute/core/Error.h"
+#if ARM_COMPUTE_CPP_SCHEDULER
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#endif
+
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+#endif
+
+using namespace arm_compute;
+
+#if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#else
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
+#endif
+
+void Scheduler::set(Type t)
+{
+ ARM_COMPUTE_ERROR_ON(!Scheduler::is_available(t));
+ _scheduler_type = t;
+}
+
+bool Scheduler::is_available(Type t)
+{
+ switch(t)
+ {
+ case Type::ST:
+ {
+ return true;
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::CUSTOM:
+ {
+ return _custom_scheduler != nullptr;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ return false;
+ }
+ }
+}
+
+Scheduler::Type Scheduler::get_type()
+{
+ return _scheduler_type;
+}
+
+IScheduler &Scheduler::get()
+{
+ switch(_scheduler_type)
+ {
+ case Type::ST:
+ {
+ return SingleThreadScheduler::get();
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return CPPScheduler::get();
+#else
+ ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
+#endif
+ break;
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return OMPScheduler::get();
+#else
+ ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
+#endif
+ break;
+ }
+ case Type::CUSTOM:
+ {
+ if(_custom_scheduler == nullptr)
+ {
+ ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+ }
+ else
+ {
+ return *_custom_scheduler;
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ break;
+ }
+ }
+ return SingleThreadScheduler::get();
+}
+
+std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+
+void Scheduler::set(std::shared_ptr<IScheduler> &scheduler)
+{
+ _custom_scheduler = scheduler;
+ set(Type::CUSTOM);
+}
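A usage sketch for backend selection (illustrative only; set_num_threads() is assumed to be part of the IScheduler interface shared by all backends):

    #include "arm_compute/runtime/Scheduler.h"

    void pick_scheduler()
    {
        using namespace arm_compute;

        // Prefer OpenMP if it was compiled in, otherwise stay single-threaded
        if(Scheduler::is_available(Scheduler::Type::OMP))
        {
            Scheduler::set(Scheduler::Type::OMP);
        }
        else
        {
            Scheduler::set(Scheduler::Type::ST);
        }

        Scheduler::get().set_num_threads(2);
    }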
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
new file mode 100644
index 0000000000..32924be3dc
--- /dev/null
+++ b/src/runtime/SubTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *SubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *SubTensor::info()
+{
+ return &_info;
+}
+
+uint8_t *SubTensor::buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->buffer();
+}
+
+ITensor *SubTensor::parent()
+{
+ return _parent;
+}
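A short sketch of creating a zero-copy view with the new SubTensor class (illustrative):

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    void subtensor_example()
    {
        using namespace arm_compute;

        Tensor parent{};
        parent.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
        parent.allocator()->allocate();

        // 32x32 region whose top-left corner sits at (16, 16); no copy is made,
        // the view shares the parent's buffer through SubTensor::buffer()
        SubTensor roi(&parent, TensorShape(32U, 32U), Coordinates(16, 16));
        (void)roi;
    }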
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
new file mode 100644
index 0000000000..435068c61d
--- /dev/null
+++ b/src/runtime/Tensor.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+Tensor::Tensor()
+ : _allocator()
+{
+}
+
+ITensorInfo *Tensor::info() const
+{
+ return &_allocator.info();
+}
+
+ITensorInfo *Tensor::info()
+{
+ return &_allocator.info();
+}
+
+uint8_t *Tensor::buffer() const
+{
+ return _allocator.data();
+}
+
+TensorAllocator *Tensor::allocator()
+{
+ return &_allocator;
+}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
new file mode 100644
index 0000000000..5c719c761a
--- /dev/null
+++ b/src/runtime/TensorAllocator.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace
+{
+bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &child_info, const Coordinates &coords)
+{
+ bool is_valid = true;
+ const TensorShape &parent_shape = parent_info.tensor_shape();
+ const TensorShape &child_shape = child_info.tensor_shape();
+ const size_t parent_dims = parent_info.num_dimensions();
+ const size_t child_dims = child_info.num_dimensions();
+
+ if(child_dims <= parent_dims)
+ {
+ for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+ {
+ const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
+
+ if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+ {
+ is_valid = false;
+ break;
+ }
+ }
+ }
+ else
+ {
+ is_valid = false;
+ }
+
+ return is_valid;
+}
+} // namespace
+
+TensorAllocator::TensorAllocator()
+ : _buffer(nullptr)
+{
+}
+
+void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
+{
+ // Get parent info
+ const TensorInfo parent_info = allocator.info();
+
+ // Check if coordinates and new shape are within the parent tensor
+ ARM_COMPUTE_ERROR_ON(!validate_subtensor_shape(parent_info, sub_info, coords));
+ ARM_COMPUTE_UNUSED(validate_subtensor_shape);
+
+ // Copy pointer to buffer
+ _buffer = allocator._buffer;
+
+ // Init tensor info with new dimensions
+ size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+ sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+
+ // Set TensorInfo
+ init(sub_info);
+}
+
+uint8_t *TensorAllocator::data() const
+{
+ return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+}
+
+void TensorAllocator::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+
+ _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+ info().set_is_resizable(false);
+}
+
+void TensorAllocator::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+
+ _buffer.reset();
+ info().set_is_resizable(true);
+}
+
+uint8_t *TensorAllocator::lock()
+{
+ return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+}
+
+void TensorAllocator::unlock()
+{
+}
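The allocator's lifecycle as used throughout the functions above, gathered in one place (sketch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    void allocator_lifecycle()
    {
        using namespace arm_compute;

        Tensor t{};
        t.allocator()->init(TensorInfo(TensorShape(8U, 8U), Format::U8)); // metadata only, no memory yet
        t.allocator()->allocate();                                        // backing std::vector<uint8_t> is created here
        // ... use t.buffer() ...
        t.allocator()->free();                                            // releases the buffer, the info becomes resizable again
    }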
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
new file mode 100644
index 0000000000..1b06117c7b
--- /dev/null
+++ b/src/runtime/Utils.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Utils.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
+{
+ static std::map<Scheduler::Type, const std::string> scheduler_type_map =
+ {
+ { Scheduler::Type::ST, "Single Thread" },
+ { Scheduler::Type::CPP, "C++11 Threads" },
+ { Scheduler::Type::OMP, "OpenMP Threads" },
+ { Scheduler::Type::CUSTOM, "Custom" }
+ };
+
+ return scheduler_type_map[t];
+}
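A one-line usage sketch of the new helper (illustrative, assuming it is declared in arm_compute/runtime/Utils.h as included above):

    #include <iostream>
    #include "arm_compute/runtime/Scheduler.h"
    #include "arm_compute/runtime/Utils.h"

    void print_scheduler()
    {
        using namespace arm_compute;
        std::cout << "Active scheduler: " << string_from_scheduler_type(Scheduler::get_type()) << "\n";
    }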