COMPMID-344 Updated doxygen

Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
author: Anthony Barbier <anthony.barbier@arm.com> 2017-09-04 18:44:23 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-09-17 13:03:09 +0100
commit: 6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
tree: a7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /src/core/CL/kernels
download: ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
63 files changed, 8686 insertions, 0 deletions
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..685b8e234e
--- /dev/null
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/** Validate the tensors, build the "absdiff" OpenCL kernel and set up its execution window.
+ * @param[in]  input1 First source tensor. Must be 1-channel U8 or S16.
+ * @param[in]  input2 Second source tensor. Must be 1-channel U8 or S16.
+ * @param[out] output Destination tensor. Must be 1-channel U8 or S16; U8 is only allowed when both inputs are U8.
+ */
+void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+                             "The output image can only be U8 if both input images are U8");
+
+    // Keep hold of the tensors: run() binds them as kernel arguments
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // The element type of every tensor is passed to the program as a build option
+    std::set<std::string> defines =
+    {
+        "-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()),
+        "-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()),
+        "-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())
+    };
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", defines));
+
+    // Execution window: 16 elements per iteration, with a horizontal access window of that width on every tensor
+    constexpr unsigned int elems_per_iteration = 16;
+    Window exec_window = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(exec_window, in1_access, in2_access, out_access);
+    // The output's valid region is derived from the intersection of both inputs' valid regions
+    ValidRegion shared_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+    out_access.set_valid_region(exec_window, shared_region);
+    ICLKernel::configure(exec_window);
+}
+
+// Runs the kernel over @p window on @p queue, enqueueing it once for every 2D slice.
+void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0; // tensor arguments are re-set from index 0 for every slice
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
new file mode 100644
index 0000000000..6333f04e71
--- /dev/null
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+// Configures accumulation of @p input (1-channel U8) into @p accum (1-channel S16).
+void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+
+    // _kernel has to be initialized before the parent's configure is called,
+    // so the "accumulate" kernel is created first.
+    constexpr unsigned int elems_per_iteration = 16;
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate"));
+    ICLSimple2DKernel::configure(input, accum, elems_per_iteration);
+}
+
+// Configures weighted accumulation of @p input (1-channel U8) into @p accum (1-channel U8).
+// @p alpha is checked to lie in [0, 1] and handed to the OpenCL kernel as a static argument.
+void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted"));
+
+    // alpha takes the first argument index after the input and accumulator 2D tensor arguments
+    const unsigned int alpha_arg = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(alpha_arg, alpha);
+
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, accum, elems_per_iteration);
+}
+
+// Configures squared accumulation of @p input (1-channel U8) into @p accum (1-channel S16).
+// @p shift is handed to the OpenCL kernel as a static argument; values above 15 are rejected.
+void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(shift > 15);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_squared"));
+
+    // The shift takes the first argument index after the input and accumulator 2D tensor arguments
+    const unsigned int shift_arg = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(shift_arg, shift);
+
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, accum, elems_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
new file mode 100644
index 0000000000..83bbe6a3be
--- /dev/null
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Configures the activation kernel. @p input must be 1-channel F16 or F32, an empty @p output is
+// initialised from @p input, and @p act_info (function, a, b) is turned into build options.
+void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Output auto initialization if not yet initialized; afterwards it has to match the input
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    // Set build options
+    const DataType data_type = input->info()->data_type();
+    std::set<std::string> defines;
+    defines.insert("-D" + string_from_activation_func(act_info.activation()));
+    defines.insert(std::string(is_data_type_float(data_type) ? "-DTYPE_FP" : "-DTYPE_INT"));
+    defines.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    defines.insert("-DA=" + val_to_string(act_info.a()));
+    defines.insert("-DB=" + val_to_string(act_info.b()));
+
+    // Make sure _kernel is initialized before calling the parent's configure
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", defines));
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple3DKernel::configure(input, output, elems_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..aaa62d0268
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/** Validate the tensors, build the "arithmetic_add" OpenCL kernel and set up its execution window.
+ * @param[in]  input1 First operand. Must be 1-channel U8, S16, F16 or F32.
+ * @param[in]  input2 Second operand. Must be 1-channel U8, S16, F16 or F32.
+ * @param[out] output Result tensor. Must be 1-channel U8, S16, F16 or F32; a U8 output requires both operands to be U8.
+ * @param[in]  policy Overflow policy: WRAP builds with -DWRAP, anything else with -DSATURATE. Floating point outputs always use -DWRAP.
+ */
+void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+    const DataType in1_type     = input1->info()->data_type();
+    const DataType in2_type     = input2->info()->data_type();
+    const DataType out_type     = output->info()->data_type();
+    const bool     float_output = is_data_type_float(out_type);
+
+    // Check for invalid combination: a U8 output needs two U8 operands
+    if(out_type == DataType::U8 && !(in1_type == DataType::U8 && in2_type == DataType::U8))
+    {
+        ARM_COMPUTE_ERROR("You called with the wrong data types.");
+    }
+
+    // Set kernel build options and create the kernel
+    std::set<std::string> defines;
+    defines.emplace((policy == ConvertPolicy::WRAP || float_output) ? "-DWRAP" : "-DSATURATE");
+    defines.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(in1_type));
+    defines.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(in2_type));
+    defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", defines));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window exec_window = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(exec_window, in1_access, in2_access, out_access);
+    // The output's valid region is derived from the intersection of both inputs' valid regions
+    ValidRegion shared_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+    out_access.set_valid_region(exec_window, shared_region);
+    ICLKernel::configure(exec_window);
+}
+
+// Enqueues the addition kernel on @p queue once for each 2D slice contained in @p window.
+void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window current = window.first_slice_window_2D();
+    for(bool pending = true; pending; pending = window.slide_window_slice_2D(current))
+    {
+        unsigned int arg_idx = 0; // both operands and the result are re-bound for every slice
+        add_2D_tensor_argument(arg_idx, _input1, current);
+        add_2D_tensor_argument(arg_idx, _input2, current);
+        add_2D_tensor_argument(arg_idx, _output, current);
+        enqueue(queue, *this, current);
+    }
+}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..4c847276da
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/** Validate the tensors, build the "arithmetic_sub" OpenCL kernel and set up its execution window.
+ * @param[in]  input1 First operand. 1-channel U8, S16, F16 or F32 (U8 only when @p output is U8).
+ * @param[in]  input2 Second operand. Same constraints as @p input1.
+ * @param[out] output Result tensor. Must be 1-channel U8, S16, F16 or F32.
+ * @param[in]  policy Overflow policy: WRAP builds with -DWRAP, anything else with -DSATURATE. Floating point outputs always use -DWRAP.
+ */
+void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    const DataType out_type = output->info()->data_type();
+    // Check for invalid combination: only a U8 output restricts the inputs (to U8)
+    if(out_type != DataType::U8)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    }
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Set kernel build options and create the kernel
+    const bool float_output = is_data_type_float(out_type);
+    std::set<std::string> defines;
+    defines.emplace((policy == ConvertPolicy::WRAP || float_output) ? "-DWRAP" : "-DSATURATE");
+    defines.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+    defines.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+    defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", defines));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window exec_window = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(exec_window, in1_access, in2_access, out_access);
+
+    // The output's valid region is derived from the intersection of both inputs' valid regions
+    ValidRegion shared_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+    out_access.set_valid_region(exec_window, shared_region);
+    ICLKernel::configure(exec_window);
+}
+
+// Enqueues the subtraction kernel on @p queue once for each 2D slice contained in @p window.
+void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Start at the first 2D slice and keep sliding until the window reports no further slice
+    Window tile = window.first_slice_window_2D();
+    for(bool more_tiles = true; more_tiles; more_tiles = window.slide_window_slice_2D(tile))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, tile);
+        add_2D_tensor_argument(arg_idx, _input2, tile);
+        add_2D_tensor_argument(arg_idx, _output, tile);
+        enqueue(queue, *this, tile);
+    }
+}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..309a153b7a
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+{
+}
+
+/** Validate the tensors, build the "batchnormalization_layer" OpenCL kernel and set up its execution window.
+ * @param[in]  input   Source tensor. Must be 1-channel F32.
+ * @param[out] output  Destination tensor. Must be 1-channel F32 with the same shape as @p input.
+ * @param[in]  mean    Mean tensor. 1-channel F32; its dimension 0 must equal dimension 2 of @p input.
+ * @param[in]  var     Variance tensor. 1-channel F32 with the same shape as @p mean.
+ * @param[in]  beta    Beta tensor. 1-channel F32 with the same shape as @p mean.
+ * @param[in]  gamma   Gamma tensor. 1-channel F32 with the same shape as @p mean.
+ * @param[in]  epsilon Value stored in the kernel object and passed to OpenCL as a static cl_float argument.
+ */
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _beta    = beta;
+    _gamma   = gamma;
+    _epsilon = epsilon;
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
+    // Static argument: epsilon comes after the input/output 3D tensors and the mean/var/beta/gamma 1D tensors
+    const unsigned int epsilon_idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor();
+    _kernel.setArg<cl_float>(epsilon_idx, _epsilon);
+    // Configure kernel window: 4 elements are processed per iteration
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+    ICLKernel::configure(win);
+}
+
+// Runs the kernel over @p window on @p queue, enqueueing it once for every 3D slice.
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice        = window.first_slice_window_3D();
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+    // mean/var/beta/gamma are set once, at the argument indices that follow the input and output 3D tensors
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(idx, _mean, vector_slice);
+    add_1D_tensor_argument(idx, _var, vector_slice);
+    add_1D_tensor_argument(idx, _beta, vector_slice);
+    add_1D_tensor_argument(idx, _gamma, vector_slice);
+
+    do
+    {
+        idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..5ea4a86da5
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseAndKernel::CLBitwiseAndKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+/** Validate the tensors, build the "bitwise_and" OpenCL kernel and set up its execution window.
+ * @param[in]  input1 First source tensor. Must be 1-channel U8.
+ * @param[in]  input2 Second source tensor. Must be 1-channel U8.
+ * @param[out] output Destination tensor. Must be 1-channel U8.
+ */
+void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // No build options are needed for this kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and"));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window exec_window = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(exec_window, in1_access, in2_access, out_access);
+
+    // The output's valid region is derived from the intersection of both inputs' valid regions
+    ValidRegion shared_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+    out_access.set_valid_region(exec_window, shared_region);
+    ICLKernel::configure(exec_window);
+}
+
+// Enqueues the bitwise AND kernel on @p queue once for each 2D slice contained in @p window.
+void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Start at the first 2D slice and keep sliding until the window reports no further slice
+    Window region = window.first_slice_window_2D();
+    for(bool keep_going = true; keep_going; keep_going = window.slide_window_slice_2D(region))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, region);
+        add_2D_tensor_argument(arg_idx, _input2, region);
+        add_2D_tensor_argument(arg_idx, _output, region);
+        enqueue(queue, *this, region);
+    }
+}
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..0098e15ab6
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+// Binds the tensors and creates the "bitwise_not" OpenCL kernel (single-channel U8 only).
+void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+    // Look the kernel up by name in the kernel library
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not"));
+
+    // The remaining set-up is delegated to ICLSimple2DKernel, 16 elements per iteration
+    constexpr unsigned int elems_per_iter = 16;
+    ICLSimple2DKernel::configure(input, output, elems_per_iter);
+}
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..2eeef0a993
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Default constructor: all tensor pointers start as nullptr.
+CLBitwiseOrKernel::CLBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the "bitwise_or" OpenCL kernel for two single-channel U8 inputs and a U8 output.
+void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Look the kernel up by name in the kernel library
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or"));
+
+    // The execution window is computed from input1 and advances 16 elements per step
+    constexpr unsigned int elems_per_iter = 16;
+    Window exec_win = calculate_max_window(*input1->info(), Steps(elems_per_iter));
+
+    // Describe the horizontal access pattern of every tensor before updating window and padding
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iter);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iter);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iter);
+    update_window_and_padding(exec_win, in1_access, in2_access, out_access);
+
+    // The output valid region is derived from the intersection of both input valid regions
+    const ValidRegion common_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                              input2->info()->valid_region());
+    out_access.set_valid_region(exec_win, common_region);
+
+    ICLKernel::configure(exec_win);
+}
+
+// Enqueues the "bitwise_or" kernel once per 2D slice of the given window.
+void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        // Tensor arguments are added in the order input1, input2, output
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c19a78e1c4
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Default constructor: all tensor pointers start as nullptr.
+CLBitwiseXorKernel::CLBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the "bitwise_xor" OpenCL kernel for two single-channel U8 inputs and a U8 output.
+void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Look the kernel up by name in the kernel library
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor"));
+
+    // The execution window is computed from input1 and advances 16 elements per step
+    constexpr unsigned int elems_per_iter = 16;
+    Window exec_win = calculate_max_window(*input1->info(), Steps(elems_per_iter));
+
+    // Describe the horizontal access pattern of every tensor before updating window and padding
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_iter);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_iter);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_iter);
+    update_window_and_padding(exec_win, in1_access, in2_access, out_access);
+
+    // The output valid region is derived from the intersection of both input valid regions
+    const ValidRegion common_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                              input2->info()->valid_region());
+    out_access.set_valid_region(exec_win, common_region);
+
+    ICLKernel::configure(exec_win);
+}
+
+// Enqueues the "bitwise_xor" kernel once per 2D slice of the given window.
+void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        // Tensor arguments are added in the order input1, input2, output
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
new file mode 100644
index 0000000000..e113d30210
--- /dev/null
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLBox3x3Kernel::border_size() const
+{
+    return BorderSize(1); // border of 1, used by configure() to offset the 3-row input access window
+}
+
+// Configures a 3x3 box filter: "convolution3x3_static" built with nine coefficients of 1 and a scale of 9.
+void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    // Compile-time defines: matrix coefficients, scale and output element type
+    std::set<std::string> cl_defines = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
+                                         "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
+                                         "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
+                                         "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
+                                       };
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", cl_defines));
+
+    // Per-iteration footprint: 8 elements processed and written, 16 elements x 3 rows read
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int elems_written   = 8;
+    constexpr unsigned int rows_read       = 3;
+
+    // The window is computed from the input, given border_undefined and the kernel border size
+    Window exec_win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border_size());
+
+    // The input rectangle is offset by the negated left/top border
+    AccessWindowRectangle  in_access(input->info(), -border_size().left, -border_size().top, elems_read, rows_read);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_written);
+    update_window_and_padding(exec_win, in_access, out_access);
+
+    out_access.set_valid_region(exec_win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(exec_win);
+}
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
new file mode 100644
index 0000000000..5d06d34631
--- /dev/null
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+// Default constructor: gradient, magnitude and phase tensor pointers start as nullptr.
+CLGradientKernel::CLGradientKernel() : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+// Configures the kernel combining the Gx/Gy gradients into a magnitude and a phase tensor.
+void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
+                             "Gx and Gy must have the same pixel size");
+    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
+                             "Mag must have the same pixel size as Gx and Gy");
+
+    _gx        = gx;
+    _gy        = gy;
+    _magnitude = magnitude;
+    _phase     = phase;
+
+    // NOTE(review): DATA_TYPE_OUT is taken from gx, not magnitude (the other Canny kernels in this file use their output tensor) - confirm intent
+    std::set<std::string> cl_defines;
+    cl_defines.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
+    cl_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+    // norm_type == 1 selects the L1 variant; every other value selects L2
+    const std::string program_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
+    _kernel                        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(program_name, cl_defines));
+
+    // The execution window is computed from gx and advances 4 elements per step
+    constexpr unsigned int elems_per_iter = 4;
+    Window exec_win = calculate_max_window(*_gx->info(), Steps(elems_per_iter));
+
+    AccessWindowHorizontal gx_access(_gx->info(), 0, elems_per_iter);
+    AccessWindowHorizontal gy_access(_gy->info(), 0, elems_per_iter);
+    AccessWindowHorizontal mag_access(_magnitude->info(), 0, elems_per_iter);
+    AccessWindowHorizontal phase_access(_phase->info(), 0, elems_per_iter);
+    update_window_and_padding(exec_win, gx_access, gy_access, mag_access, phase_access);
+
+    // Magnitude and phase both take their valid region from gx
+    mag_access.set_valid_region(exec_win, _gx->info()->valid_region());
+    phase_access.set_valid_region(exec_win, _gx->info()->valid_region());
+
+    ICLKernel::configure(exec_win);
+}
+
+// Enqueues the gradient-combination kernel once per 2D slice of the given window.
+void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0; // arguments are added in the order gx, gy, magnitude, phase
+        add_2D_tensor_argument(arg_idx, _gx, plane);
+        add_2D_tensor_argument(arg_idx, _gy, plane);
+        add_2D_tensor_argument(arg_idx, _magnitude, plane);
+        add_2D_tensor_argument(arg_idx, _phase, plane);
+        enqueue(queue, *this, plane);
+    }
+}
+
+// Default constructor: magnitude, phase and output tensor pointers start as nullptr.
+CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel() : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
+{
+    return BorderSize(1); // border of 1, used by configure() to offset the 3x3 magnitude access window
+}
+
+// Configures the "suppress_non_maximum" kernel operating on a magnitude/phase pair.
+void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
+
+    _magnitude = magnitude;
+    _phase     = phase;
+    _output    = output;
+
+    // The magnitude and output element types become compile-time defines of the OpenCL program
+    std::set<std::string> cl_defines;
+    cl_defines.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
+    cl_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", cl_defines));
+
+    // The threshold is placed right after the three 2D tensors (magnitude, phase, output) that run() binds
+    const unsigned int thr_arg_idx = 3 * num_arguments_per_2D_tensor();
+    _kernel.setArg(thr_arg_idx, lower_thr);
+
+    // One element is processed per iteration; magnitude is accessed through a 3x3 rectangle
+    constexpr unsigned int elems_per_iter = 1;
+    constexpr unsigned int mag_extent     = 3;
+
+    Window exec_win = calculate_max_window(*_magnitude->info(), Steps(elems_per_iter), border_undefined, border_size());
+
+    AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
+                                     mag_extent, mag_extent);
+    AccessWindowHorizontal phase_access(_phase->info(), 0, elems_per_iter);
+    AccessWindowHorizontal output_access(_output->info(), 0, elems_per_iter);
+
+    update_window_and_padding(exec_win, mag_access, phase_access, output_access);
+
+    output_access.set_valid_region(exec_win, _magnitude->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(exec_win);
+}
+
+// Enqueues the non-maximum suppression kernel once per 2D slice of the given window.
+void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0; // arguments are added in the order magnitude, phase, output
+        add_2D_tensor_argument(arg_idx, _magnitude, plane);
+        add_2D_tensor_argument(arg_idx, _phase, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
+
+// Default constructor: every tensor pointer starts as nullptr and both thresholds as 0.
+CLEdgeTraceKernel::CLEdgeTraceKernel() : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
+{
+}
+
+// Configures the "hysteresis" edge-tracing kernel and binds its constant arguments.
+void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
+
+    _input            = input;
+    _output           = output;
+    _lower_thr        = lower_thr;
+    _upper_thr        = upper_thr;
+    _visited          = visited;
+    _recorded         = recorded;
+    _l1_stack         = l1_stack;
+    _l1_stack_counter = l1_stack_counter;
+
+    // The input and output element types become compile-time defines of the OpenCL program
+    std::set<std::string> cl_defines;
+    cl_defines.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+    cl_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", cl_defines));
+
+    // Constant arguments come after the six 2D tensors bound in run(): lower threshold, upper threshold, width, height
+    // NOTE(review): thresholds are bound lower-first although configure() receives upper first - confirm against the kernel signature
+    const unsigned int width   = _input->info()->dimension(0);
+    const unsigned int height  = _input->info()->dimension(1);
+    unsigned int       arg_idx = 6 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, static_cast<cl_uint>(_lower_thr));
+    _kernel.setArg(arg_idx++, static_cast<cl_uint>(_upper_thr));
+    _kernel.setArg(arg_idx++, static_cast<cl_uint>(width));
+    _kernel.setArg(arg_idx++, static_cast<cl_uint>(height));
+
+    // The execution window is computed from the input and advances one element per step
+    constexpr unsigned int elems_per_iter = 1;
+    Window exec_win = calculate_max_window(*_input->info(), Steps(elems_per_iter));
+
+    AccessWindowHorizontal input_access(_input->info(), 0, elems_per_iter);
+    AccessWindowHorizontal output_access(_output->info(), 0, elems_per_iter);
+    AccessWindowHorizontal visited_access(_visited->info(), 0, elems_per_iter);
+    AccessWindowHorizontal recorded_access(_recorded->info(), 0, elems_per_iter);
+    AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, elems_per_iter);
+    AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, elems_per_iter);
+
+    update_window_and_padding(exec_win,
+                              input_access, output_access,
+                              visited_access, recorded_access,
+                              l1_stack_access, l1_stack_counter_access);
+
+    // All five non-input tensors take their valid region from the input
+    output_access.set_valid_region(exec_win, _input->info()->valid_region());
+    visited_access.set_valid_region(exec_win, _input->info()->valid_region());
+    recorded_access.set_valid_region(exec_win, _input->info()->valid_region());
+    l1_stack_access.set_valid_region(exec_win, _input->info()->valid_region());
+    l1_stack_counter_access.set_valid_region(exec_win, _input->info()->valid_region());
+
+    ICLKernel::configure(exec_win);
+}
+
+// Enqueues the edge-tracing kernel once per 2D slice of the given window.
+void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0; // the six tensors added here precede the constant arguments set in configure()
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        add_2D_tensor_argument(arg_idx, _visited, plane);
+        add_2D_tensor_argument(arg_idx, _recorded, plane);
+        add_2D_tensor_argument(arg_idx, _l1_stack, plane);
+        add_2D_tensor_argument(arg_idx, _l1_stack_counter, plane);
+
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
new file mode 100644
index 0000000000..d729ebcfb3
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Default constructor: no planes or outputs bound, sub-sampling factors of 1 for all three entries.
+CLChannelCombineKernel::CLChannelCombineKernel() : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
+{
+}
+
+// Configures the kernel that combines three (four for RGBA8888) U8 planes into a single output tensor.
+void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+
+    const Format out_format = output->info()->format();
+    _planes[0]              = plane0;
+    _planes[1]              = plane1;
+    _planes[2]              = plane2;
+    if(out_format == Format::RGBA8888)
+    {
+        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+        _planes[3] = plane3;
+    }
+    else
+    {
+        _planes[3] = nullptr;
+    }
+    _output       = output;
+    _output_multi = nullptr;
+
+    // Planes 1 and 2 use a sub-sampling factor of 2 for YUYV422/UYVY422 and 1 for the other formats
+    // NOTE(review): the vertical factor is also set to 2 although 4:2:2 formats are only horizontally sub-sampled - confirm
+    if(out_format == Format::YUYV422 || out_format == Format::UYVY422)
+    {
+        _x_subsampling = { { 1, 2, 2 } };
+        _y_subsampling = { { 1, 2, 2 } };
+    }
+    else
+    {
+        _x_subsampling = { { 1, 1, 1 } };
+        _y_subsampling = { { 1, 1, 1 } };
+    }
+
+    // The kernel name is "channel_combine_" followed by the string form of the output format
+    const std::string program_name = "channel_combine_" + string_from_format(out_format);
+    _kernel                        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(program_name));
+
+    // The execution window is computed from the output and advances 16 elements per step
+    constexpr unsigned int elems_per_iter = 16;
+    Window exec_win = calculate_max_window(*output->info(), Steps(elems_per_iter));
+
+    AccessWindowHorizontal plane0_access(plane0->info(), 0, elems_per_iter);
+    AccessWindowRectangle  plane1_access(plane1->info(), 0, 0, elems_per_iter, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    AccessWindowRectangle  plane2_access(plane2->info(), 0, 0, elems_per_iter, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+    AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, elems_per_iter);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iter);
+    update_window_and_padding(exec_win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
+
+    ValidRegion common_region = intersect_valid_regions(plane0->info()->valid_region(),
+                                                        plane1->info()->valid_region(),
+                                                        plane2->info()->valid_region());
+    if(plane3 != nullptr)
+    {
+        common_region = intersect_valid_regions(plane3->info()->valid_region(), common_region);
+    }
+    output_access.set_valid_region(exec_win, ValidRegion(common_region.anchor, output->info()->tensor_shape()));
+
+    ICLKernel::configure(exec_win);
+}
+
+/** Configure the kernel to combine three U8 planes into a multi-planar image.
+ *
+ * @param[in]  plane0 First input plane (2D, U8). Its info is used to compute the execution window.
+ * @param[in]  plane1 Second input plane (2D, U8).
+ * @param[in]  plane2 Third input plane (2D, U8).
+ * @param[out] output Destination multi-planar image. Accepted formats: NV12, NV21, IYUV, YUV444.
+ */
+void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+
+    // Multi-planar output: there is no fourth input plane and no single-plane output
+    _planes[0]           = plane0;
+    _planes[1]           = plane1;
+    _planes[2]           = plane2;
+    _planes[3]           = nullptr;
+    _output              = nullptr;
+    _output_multi        = output;
+    bool has_two_planars = false;
+
+    // Set sub-sampling parameters for each plane
+    const Format          fmt = output->info()->format();
+    std::string           kernel_name;
+    std::set<std::string> build_opts;
+
+    if(Format::NV12 == fmt || Format::NV21 == fmt)
+    {
+        _x_subsampling = { { 1, 2, 2 } };
+        _y_subsampling = { { 1, 2, 2 } };
+        kernel_name    = "channel_combine_NV";
+        build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21");
+        has_two_planars = true;
+    }
+    else
+    {
+        if(Format::IYUV == fmt)
+        {
+            _x_subsampling = { { 1, 2, 2 } };
+            _y_subsampling = { { 1, 2, 2 } };
+        }
+        else
+        {
+            _x_subsampling = { { 1, 1, 1 } };
+            _y_subsampling = { { 1, 1, 1 } };
+        }
+
+        kernel_name = "copy_planes_3p";
+        build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    AccessWindowRectangle  input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+    AccessWindowRectangle  output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
+    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    // Two-plane formats have no third output plane, so that access window is created without a tensor info
+    AccessWindowRectangle  output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+    update_window_and_padding(win,
+                              input_plane0_access, input_plane1_access, input_plane2_access,
+                              output_plane0_access, output_plane1_access, output_plane2_access);
+
+    ValidRegion plane0_valid_region = plane0->info()->valid_region();
+    // For two-plane formats output plane 1 is constrained by both plane1 and plane2.
+    // For three-plane formats output plane 1 corresponds to plane1 only, so its region
+    // must come from plane1 (it was previously taken from plane2 by mistake).
+    ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane1->info()->valid_region();
+    output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
+    output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
+    output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the combine kernel once per 2D slice of the given window.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Builds a copy of a window whose X/Y ranges are divided by the sub-sampling factors of one plane
+    auto make_subsampled_window = [this](const Window &base, unsigned int plane)
+    {
+        Window scaled(base);
+        scaled.set(Window::DimX, Window::Dimension(scaled.x().start() / _x_subsampling[plane], scaled.x().end() / _x_subsampling[plane], scaled.x().step() / _x_subsampling[plane]));
+        scaled.set(Window::DimY, Window::Dimension(scaled.y().start() / _y_subsampling[plane], scaled.y().end() / _y_subsampling[plane], 1));
+        return scaled;
+    };
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        Window plane1_window = make_subsampled_window(slice, 1);
+        Window plane2_window = make_subsampled_window(slice, 2);
+
+        unsigned int arg_idx = 0;
+
+        // Input planes; the fourth one is optional
+        add_2D_tensor_argument(arg_idx, _planes[0], slice);
+        add_2D_tensor_argument(arg_idx, _planes[1], plane1_window);
+        add_2D_tensor_argument(arg_idx, _planes[2], plane2_window);
+
+        if(_planes[3] != nullptr)
+        {
+            add_2D_tensor_argument(arg_idx, _planes[3], slice);
+        }
+
+        // Outputs
+        if(_output == nullptr)
+        {
+            // Multi-planar output: shrink the slice along Y by plane 1's sub-sampling
+            // factor to avoid out-of-bounds accesses. This must happen only after the
+            // input arguments above have been set with the unmodified slice.
+            slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
+
+            add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(0), slice);
+            add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(1), plane1_window);
+
+            if(3 == num_planes_from_format(_output_multi->info()->format()))
+            {
+                add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(2), plane2_window);
+            }
+
+            // The last kernel argument is the end of the (reduced) Y range
+            _kernel.setArg(arg_idx++, slice.y().end());
+        }
+        else
+        {
+            // Single-plane output
+            add_2D_tensor_argument(arg_idx, _output, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
new file mode 100644
index 0000000000..541153316a
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors bound yet, 8 elements processed per iteration and a sub-sampling factor of 1. */
+CLChannelExtractKernel::CLChannelExtractKernel()
+    : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
+{
+}
+
+/** Configure the kernel to extract one channel from a single-plane image.
+ *
+ * @param[in]  input   Source tensor. Accepted formats: RGB888, RGBA8888, YUYV422, UYVY422.
+ * @param[in]  channel Channel to extract; must exist in the input format.
+ * @param[out] output  Destination tensor (U8). Must not be the same object as @p input.
+ */
+void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+    ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+    _input  = input;
+    _output = output;
+
+    // Check format
+    const Format format = input->info()->format();
+    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+    // Create kernel
+    std::string           kernel_name = "channel_extract_" + string_from_format(format);
+    std::set<std::string> build_opts  = { ("-DCHANNEL_" + string_from_channel(channel)) };
+    _kernel                           = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Half the processed elements for U,V channels due to sub-sampling of 2
+    _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
+
+    // Configure window
+    Window                 win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+    // The output access window must describe the output tensor. It was previously built on
+    // the input's info, so the output received neither padding nor a valid region and the
+    // input's valid region was overwritten instead.
+    AccessWindowRectangle  output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The output keeps the input's anchor but spans its own shape
+    ValidRegion input_valid_region = input->info()->valid_region();
+    output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Configure the kernel to extract one channel from a multi-planar image.
+ *
+ * @param[in]  input   Source multi-planar image. Accepted formats: NV12, NV21, IYUV, YUV444.
+ * @param[in]  channel Channel to extract; it selects which input plane is read.
+ * @param[out] output  Destination 2D image (U8). Must not be the same object as @p input.
+ */
+void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+    ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+    // Get format
+    const Format fmt = input->info()->format();
+
+    // Get input plane
+    const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel));
+    ARM_COMPUTE_ERROR_ON(nullptr == input_plane);
+
+    _output      = output;
+    _input       = input_plane;
+    _subsampling = 1;
+
+    // Create kernel: the Y channel and every channel of IYUV/YUV444 use the plain
+    // plane-copy kernel; the remaining cases use a format-specific extraction kernel
+    std::string           kernel_name;
+    std::set<std::string> build_opts;
+    if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt)
+    {
+        kernel_name = "copy_plane";
+    }
+    else
+    {
+        kernel_name = "channel_extract_" + string_from_format(fmt);
+        build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window
+    Window                 win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
+    // The output access window must describe the output image. It was previously built on
+    // the input plane's info, so the output received neither padding nor a valid region.
+    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input_plane->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the extraction kernel once per 2D slice of the given window.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        // The output is addressed through the slice divided by the sub-sampling factor
+        const Window::Dimension out_x(slice.x().start() / _subsampling, slice.x().end() / _subsampling, slice.x().step() / _subsampling);
+        const Window::Dimension out_y(slice.y().start() / _subsampling, slice.y().end() / _subsampling, 1);
+
+        Window out_window(slice);
+        out_window.set(Window::DimX, out_x);
+        out_window.set(Window::DimY, out_y);
+
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice);
+        add_2D_tensor_argument(arg_idx, _output, out_window);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
new file mode 100644
index 0000000000..ad66c39483
--- /dev/null
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors bound yet and value-initialised convolved dimensions. */
+CLCol2ImKernel::CLCol2ImKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+/** Configure the col2im kernel.
+ *
+ * @param[in]  input          Source tensor (single channel, F32).
+ * @param[out] output         Destination tensor (single channel, F32, same data type as @p input).
+ * @param[in]  convolved_dims Pair of convolved dimensions; only the first element is passed to the kernel here.
+ */
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _convolved_dims = convolved_dims;
+    _output         = output;
+    _input          = input;
+
+    // Build the kernel for the input's data type
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+
+    // The static argument sits right after the 2D input and 3D output tensor arguments
+    const unsigned int static_arg_idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+    _kernel.setArg<cl_uint>(static_arg_idx, _convolved_dims.first);
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // No padding is required by this kernel, so update_window_and_padding() is not called
+    // and the whole output shape is declared valid
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the col2im kernel, pairing each 2D input slice with a 3D output slice.
+ *
+ * @param[in]     window Region to execute; must match the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = window.first_slice_window_3D();
+
+    bool more_slices = true;
+    while(more_slices)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, in_slice);
+        add_3D_tensor_argument(arg_idx, _output, out_slice);
+        enqueue(queue, *this, in_slice);
+
+        // Advance both slices; the 3D slide is only attempted when the 2D slide succeeded
+        more_slices = window.slide_window_slice_2D(in_slice) && window.slide_window_slice_3D(out_slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
new file mode 100644
index 0000000000..ead2b8f092
--- /dev/null
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <sstream>
+
+using namespace arm_compute;
+
+/** Default constructor: neither single-plane nor multi-planar inputs/outputs are bound yet. */
+CLColorConvertKernel::CLColorConvertKernel()
+    : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
+{
+}
+
+/** Configure the kernel to convert between two single-plane images.
+ *
+ * Supported conversions: RGBA8888 to RGB888, RGB888 to RGBA8888, and UYVY422/YUYV422 to
+ * RGB888 or RGBA8888. Any other combination triggers ARM_COMPUTE_ERROR_ON_MSG.
+ *
+ * @param[in]  input  Source tensor.
+ * @param[out] output Destination tensor.
+ */
+void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    const Format src_format = input->info()->format();
+    const Format dst_format = output->info()->format();
+
+    const bool src_is_yuv422 = (Format::UYVY422 == src_format) || (Format::YUYV422 == src_format);
+    const bool dst_is_rgb    = (Format::RGB888 == dst_format) || (Format::RGBA8888 == dst_format);
+
+    // Zero means "conversion not supported"
+    unsigned int num_elems_processed_per_iteration = 0;
+    if(Format::RGBA8888 == src_format && Format::RGB888 == dst_format)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
+    else if(src_is_yuv422 && dst_is_rgb)
+    {
+        num_elems_processed_per_iteration = 8;
+    }
+    else if(Format::RGB888 == src_format && Format::RGBA8888 == dst_format)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
+    ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                             string_from_format(src_format).c_str(),
+                             string_from_format(dst_format).c_str());
+
+    _input  = input;
+    _output = output;
+
+    // Kernel name is "<input format>_to_<output format>_bt709"
+    const std::string kernel_name = string_from_format(src_format) + "_to_" + string_from_format(dst_format) + "_bt709";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal src_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal dst_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, src_access, dst_access);
+
+    // The output inherits the input's valid region
+    dst_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Configure the kernel to convert a multi-planar image into a single-plane image.
+ *
+ * Supported conversions: NV12, NV21 or IYUV to RGB888 or RGBA8888. Any other combination
+ * triggers ARM_COMPUTE_ERROR_ON_MSG.
+ *
+ * @param[in]  input  Source multi-planar image.
+ * @param[out] output Destination 2D image. Its info is used to compute the execution window.
+ */
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    const Format src_format = input->info()->format();
+    const Format dst_format = output->info()->format();
+
+    const bool has_two_planes    = (Format::NV12 == src_format) || (Format::NV21 == src_format);
+    const bool src_is_subsampled = has_two_planes || (Format::IYUV == src_format);
+    const bool dst_is_rgb        = (Format::RGB888 == dst_format) || (Format::RGBA8888 == dst_format);
+
+    // The supported source formats are exactly the sub-sampled ones; zero means "not supported"
+    const unsigned int num_elems_processed_per_iteration = (src_is_subsampled && dst_is_rgb) ? 4 : 0;
+    ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                             string_from_format(src_format).c_str(),
+                             string_from_format(dst_format).c_str());
+
+    _multi_input = input;
+    _output      = output;
+
+    // Kernel name is "<input format>_to_<output format>_bt709"
+    const std::string kernel_name = string_from_format(src_format) + "_to_" + string_from_format(dst_format) + "_bt709";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Configure kernel window: planes 1 and 2 are accessed with a 0.5 scale when sub-sampled
+    const float sub_sampling = src_is_subsampled ? 0.5f : 1.f;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set_dimension_step(Window::DimY, 2);
+
+    AccessWindowHorizontal src_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  src_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+    // Two-plane formats get an access window without a tensor info for the third plane
+    AccessWindowRectangle  src_plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+    AccessWindowHorizontal dst_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, src_plane0_access, src_plane1_access, src_plane2_access, dst_access);
+
+    // The output anchor is the intersection of the three input plane regions
+    ValidRegion common_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+                                                        input->plane(1)->info()->valid_region(),
+                                                        input->plane(2)->info()->valid_region());
+    dst_access.set_valid_region(win, ValidRegion(common_region.anchor, output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Configure the kernel to convert a single-plane image into a multi-planar image.
+ *
+ * Supported conversions: RGB888/RGBA8888 to NV12, IYUV or YUV444, and UYVY422/YUYV422 to
+ * NV12 or IYUV. Any other combination triggers ARM_COMPUTE_ERROR_ON_MSG.
+ *
+ * @param[in]  input  Source 2D image. Its info is used to compute the execution window.
+ * @param[out] output Destination multi-planar image.
+ */
+void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    const Format src_format = input->info()->format();
+    const Format dst_format = output->info()->format();
+
+    const bool src_is_rgb          = (Format::RGB888 == src_format) || (Format::RGBA8888 == src_format);
+    const bool src_is_yuv422       = (Format::UYVY422 == src_format) || (Format::YUYV422 == src_format);
+    const bool dst_is_nv12_or_iyuv = (Format::NV12 == dst_format) || (Format::IYUV == dst_format);
+    const bool dst_is_yuv444       = (Format::YUV444 == dst_format);
+
+    // Zero means "conversion not supported"
+    unsigned int num_elems_processed_per_iteration = 0;
+    if(src_is_rgb)
+    {
+        if(dst_is_nv12_or_iyuv)
+        {
+            num_elems_processed_per_iteration = 2;
+        }
+        else if(dst_is_yuv444)
+        {
+            num_elems_processed_per_iteration = 4;
+        }
+    }
+    else if(src_is_yuv422 && dst_is_nv12_or_iyuv)
+    {
+        num_elems_processed_per_iteration = 8;
+    }
+    ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                             string_from_format(src_format).c_str(),
+                             string_from_format(dst_format).c_str());
+
+    _input        = input;
+    _multi_output = output;
+
+    // Kernel name is "<input format>_to_<output format>_bt709"
+    const std::string kernel_name = string_from_format(src_format) + "_to_" + string_from_format(dst_format) + "_bt709";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Configure kernel window: every conversion except RGB888/RGBA8888 -> YUV444 steps two rows at a time
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    if(!(src_is_rgb && dst_is_yuv444))
+    {
+        win.set_dimension_step(Window::DimY, 2);
+    }
+
+    // Planes 1 and 2 are accessed with a 0.5 scale for NV12, NV21 and IYUV outputs
+    const bool  has_two_planes = (Format::NV12 == dst_format) || (Format::NV21 == dst_format);
+    const float sub_sampling   = (has_two_planes || (Format::IYUV == dst_format)) ? 0.5f : 1.f;
+
+    AccessWindowHorizontal dst_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  dst_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+    // Two-plane formats get an access window without a tensor info for the third plane
+    AccessWindowRectangle  dst_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              dst_plane0_access,
+                              dst_plane1_access,
+                              dst_plane2_access);
+
+    // Every output plane keeps the input's anchor but spans its own shape
+    ValidRegion src_region = input->info()->valid_region();
+
+    dst_plane0_access.set_valid_region(win, ValidRegion(src_region.anchor, output->plane(0)->info()->tensor_shape()));
+    dst_plane1_access.set_valid_region(win, ValidRegion(src_region.anchor, output->plane(1)->info()->tensor_shape()));
+    dst_plane2_access.set_valid_region(win, ValidRegion(src_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Configure the kernel to convert between two multi-planar images.
+ *
+ * Supported conversions: NV12/NV21 to IYUV or YUV444, and IYUV to YUV444 or NV12. Any other
+ * combination triggers ARM_COMPUTE_ERROR_ON_MSG.
+ *
+ * @param[in]  input  Source multi-planar image. Its first plane is used to compute the execution window.
+ * @param[out] output Destination multi-planar image.
+ */
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+    const Format src_format = input->info()->format();
+    const Format dst_format = output->info()->format();
+
+    const bool src_is_nv   = (Format::NV12 == src_format) || (Format::NV21 == src_format);
+    const bool src_is_iyuv = (Format::IYUV == src_format);
+    const bool dst_is_nv   = (Format::NV12 == dst_format) || (Format::NV21 == dst_format);
+    const bool dst_is_iyuv = (Format::IYUV == dst_format);
+
+    const bool nv_supported   = src_is_nv && (dst_is_iyuv || (Format::YUV444 == dst_format));
+    const bool iyuv_supported = src_is_iyuv && ((Format::YUV444 == dst_format) || (Format::NV12 == dst_format));
+
+    // Zero means "conversion not supported"
+    const unsigned int num_elems_processed_per_iteration = (nv_supported || iyuv_supported) ? 16 : 0;
+    ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                             string_from_format(src_format).c_str(),
+                             string_from_format(dst_format).c_str());
+
+    _multi_input  = input;
+    _multi_output = output;
+
+    // Planes 1 and 2 are accessed with a 0.5 scale for NV12, NV21 and IYUV images
+    const float src_sub_sampling = (src_is_nv || src_is_iyuv) ? 0.5f : 1.f;
+    const float dst_sub_sampling = (dst_is_nv || dst_is_iyuv) ? 0.5f : 1.f;
+
+    // Kernel name is "<input format>_to_<output format>_bt709"
+    const std::string kernel_name = string_from_format(src_format) + "_to_" + string_from_format(dst_format) + "_bt709";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Configure kernel window: two rows are processed per step
+    Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
+    win.set_dimension_step(Window::DimY, 2);
+
+    // Two-plane formats get an access window without a tensor info for the third plane
+    AccessWindowHorizontal src_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  src_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, src_sub_sampling, src_sub_sampling);
+    AccessWindowRectangle  src_plane2_access(src_is_nv ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, src_sub_sampling, src_sub_sampling);
+    AccessWindowHorizontal dst_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  dst_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, dst_sub_sampling, dst_sub_sampling);
+    AccessWindowRectangle  dst_plane2_access(dst_is_nv ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, dst_sub_sampling, dst_sub_sampling);
+
+    update_window_and_padding(win,
+                              src_plane0_access, src_plane1_access, src_plane2_access,
+                              dst_plane0_access, dst_plane1_access, dst_plane2_access);
+
+    // Every output plane takes the anchor of the intersected input regions and spans its own shape
+    ValidRegion common_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+                                                        input->plane(1)->info()->valid_region(),
+                                                        input->plane(2)->info()->valid_region());
+    dst_plane0_access.set_valid_region(win, ValidRegion(common_region.anchor, output->plane(0)->info()->tensor_shape()));
+    dst_plane1_access.set_valid_region(win, ValidRegion(common_region.anchor, output->plane(1)->info()->tensor_shape()));
+    dst_plane2_access.set_valid_region(win, ValidRegion(common_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+// Binds the kernel arguments for the stored input/output pairing and enqueues the kernel
+// once for every 2D slice of the given window.
+//
+// window: region to process; checked against the window stored on the kernel.
+// queue:  command queue the kernel is enqueued on.
+//
+// Four pairings are handled (single or multi-planar on either side); anything else
+// ends in ARM_COMPUTE_ERROR("Not supported").
+void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Halves start/end of both dimensions (and the X step) for NV12, NV21 and IYUV,
+    // forcing the Y step to 1. Windows for any other format are left untouched.
+    const auto halve_if_subsampled = [](Window &plane_win, Format fmt)
+    {
+        const bool subsampled = (Format::NV12 == fmt) || (Format::NV21 == fmt) || (Format::IYUV == fmt);
+        if(!subsampled)
+        {
+            return;
+        }
+        plane_win.set(Window::DimX, Window::Dimension(plane_win.x().start() / 2, plane_win.x().end() / 2, plane_win.x().step() / 2));
+        plane_win.set(Window::DimY, Window::Dimension(plane_win.y().start() / 2, plane_win.y().end() / 2, 1));
+    };
+
+    const bool single_in  = (nullptr != _input);
+    const bool single_out = (nullptr != _output);
+    const bool multi_in   = (nullptr != _multi_input);
+    const bool multi_out  = (nullptr != _multi_output);
+
+    Window slice = window.first_slice_window_2D();
+
+    // Single tensor -> single tensor
+    if(single_in && single_out)
+    {
+        for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+        {
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _input, slice);
+            add_2D_tensor_argument(arg_idx, _output, slice);
+            enqueue(queue, *this, slice);
+        }
+        return;
+    }
+
+    // Single tensor -> multi-planar image
+    if(single_in && multi_out)
+    {
+        const Format out_format = _multi_output->info()->format();
+        for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+        {
+            Window uv_win(slice);
+            halve_if_subsampled(uv_win, out_format);
+
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _input, slice);
+            add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(0), slice);
+            // Planes 1 and 2 are bound only while they report a non-zero number of dimensions
+            for(int plane = 1; plane < 3 && (0 != _multi_output->cl_plane(plane)->info()->num_dimensions()); ++plane)
+            {
+                add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(plane), uv_win);
+            }
+            enqueue(queue, *this, slice);
+        }
+        return;
+    }
+
+    // Multi-planar image -> single tensor
+    if(multi_in && single_out)
+    {
+        const Format in_format = _multi_input->info()->format();
+        for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+        {
+            Window uv_win(slice);
+            halve_if_subsampled(uv_win, in_format);
+
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(0), slice);
+            for(int plane = 1; plane < 3 && (0 != _multi_input->cl_plane(plane)->info()->num_dimensions()); ++plane)
+            {
+                add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(plane), uv_win);
+            }
+            add_2D_tensor_argument(arg_idx, _output, slice);
+            enqueue(queue, *this, slice);
+        }
+        return;
+    }
+
+    // Multi-planar image -> multi-planar image: each side gets its own UV window
+    if(multi_in && multi_out)
+    {
+        const Format in_format  = _multi_input->info()->format();
+        const Format out_format = _multi_output->info()->format();
+        for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+        {
+            Window in_uv_win(slice);
+            halve_if_subsampled(in_uv_win, in_format);
+
+            Window out_uv_win(slice);
+            halve_if_subsampled(out_uv_win, out_format);
+
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(0), slice);
+            for(int plane = 1; plane < 3 && (0 != _multi_input->cl_plane(plane)->info()->num_dimensions()); ++plane)
+            {
+                add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(plane), in_uv_win);
+            }
+
+            add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(0), slice);
+            for(int plane = 1; plane < 3 && (0 != _multi_output->cl_plane(plane)->info()->num_dimensions()); ++plane)
+            {
+                add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(plane), out_uv_win);
+            }
+            enqueue(queue, *this, slice);
+        }
+        return;
+    }
+
+    ARM_COMPUTE_ERROR("Not supported");
+}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
new file mode 100644
index 0000000000..bdfe398a1d
--- /dev/null
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+#define MAX_MATRIX_SIZE 81 // 9 x 9: the largest width x height accepted by CLConvolutionRectangleKernel::configure
+
+/****************************************************************************************\
+ *                                 Square Convolution                                *
+\****************************************************************************************/
+
+// Border reported by the square convolution: a BorderSize built from half the matrix size (integer division).
+template <unsigned int matrix_size>
+BorderSize CLConvolutionKernel<matrix_size>::border_size() const
+{
+    constexpr unsigned int half_extent = matrix_size / 2;
+    return BorderSize(half_extent);
+}
+
+// Configures the static square convolution kernel for a matrix_size x matrix_size matrix.
+//
+// input:            source tensor (single channel U8).
+// output:           destination tensor (single channel U8 or S16).
+// conv:             matrix_size * matrix_size coefficients, must not be null.
+// scale:            value emitted as -DSCALE; 0 requests calculate_matrix_scale(conv, matrix_size) instead.
+// border_undefined: forwarded to the window and valid-region computation.
+template <unsigned int matrix_size>
+void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+    _input  = input;
+    _output = output;
+
+    constexpr unsigned int num_coefficients = matrix_size * matrix_size;
+
+    // A zero scale means "derive it from the coefficients"
+    if(scale == 0)
+    {
+        scale = calculate_matrix_scale(conv, matrix_size);
+    }
+
+    // Every coefficient is baked into the program as its own -DMAT<i> define
+    std::set<std::string> build_opts;
+    for(unsigned int coeff = 0; coeff < num_coefficients; ++coeff)
+    {
+        std::stringstream define;
+        define << "-DMAT" << coeff << "=" << conv[coeff];
+        build_opts.insert(define.str());
+    }
+
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+    const DataType matrix_type = data_type_for_convolution_matrix(conv, num_coefficients);
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(matrix_type));
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    // Kernel name has the form "convolution<N>x<N>_static"
+    std::stringstream name_builder;
+    name_builder << "convolution" << matrix_size << "x" << matrix_size << "_static";
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(name_builder.str(), build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // Reads start one border to the left of and above the current element
+    AccessWindowRectangle  src_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal dst_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, src_access, dst_access);
+
+    dst_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ *                                 Separable Convolution                                *
+\****************************************************************************************/
+template <unsigned int matrix_size>
+CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
+    : _border_size(0) // Placeholder; configure() stores the real border
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize             CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+    return _border_size; // Value stored by configure(); 0 straight after construction
+}
+
+// Configures the horizontal (1 x matrix_size) pass of the separable convolution.
+//
+// input:            source tensor (single channel U8).
+// output:           destination tensor (single channel U16, S16 or S32); its type is also emitted as -DDATA_TYPE.
+// conv:             matrix_size coefficients of the horizontal vector, must not be null.
+// border_undefined: when true the top/bottom border is 0, otherwise matrix_size / 2;
+//                   the left/right border is always matrix_size / 2.
+template <unsigned int matrix_size>
+void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+    // conv is read by memcpy below, so reject a null pointer up front like the other convolution kernels do
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    // Only the first matrix_size entries come from conv; every remaining MAT define stays 0
+    int16_t mat[matrix_size * matrix_size] = { 0 };
+    memcpy(mat, conv, matrix_size * sizeof(int16_t));
+
+    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+    {
+        build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+    }
+
+    build_opts.insert("-DSCALE=0");
+
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // Reads start one left border before the current element
+    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+// Border of the vertical pass: matrix_size / 2 as the first BorderSize argument, 0 as the second.
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+    constexpr unsigned int vertical_reach = matrix_size / 2;
+    return BorderSize(vertical_reach, 0);
+}
+
+// Configures the vertical (matrix_size x 1) pass of the separable convolution.
+//
+// input:            source tensor (single channel U16, S16 or S32); its type is emitted as -DDATA_TYPE.
+// output:           destination tensor (single channel U8 or S16); its type is emitted as -DDATA_TYPE_OUT.
+// conv:             matrix_size coefficients of the vertical vector, must not be null.
+// scale:            value emitted as -DSCALE, must not be 0.
+// border_undefined: forwarded to the window and valid-region computation.
+// data_type:        type emitted as -DCOMPUTE_TYPE.
+template <unsigned int matrix_size>
+void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
+                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    // conv is read by memcpy below, so reject a null pointer up front like the other convolution kernels do
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+    ARM_COMPUTE_ERROR_ON(scale == 0);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+
+    // The coefficients land in entries [matrix_size, 2 * matrix_size); every other MAT define stays 0
+    int16_t mat[matrix_size * matrix_size] = { 0 };
+    memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
+
+    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+    {
+        build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+    }
+
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+
+    build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
+
+    std::stringstream out_type;
+    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+    build_opts.insert(out_type.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 8;
+    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // Reads start one top border above the current element and span matrix_size rows
+    AccessWindowRectangle  input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
+    : _border_size(0), _input(nullptr), _output(nullptr) // Empty state; configure() fills in all three
+{
+}
+
+BorderSize CLConvolutionRectangleKernel::border_size() const
+{
+    return _border_size; // BorderSize(height / 2, width / 2) once configure() has run
+}
+
+// Configures the "convolution_rectangle" kernel for a width x height matrix.
+//
+// input:            source tensor (single channel U8).
+// output:           destination tensor (single channel U8 or S16).
+// conv:             width * height coefficients, must not be null.
+// width, height:    matrix dimensions; each must be 3, 5, 7 or 9.
+// scale:            value emitted as -DSCALE, must not be 0.
+// border_undefined: forwarded to the window and valid-region computation.
+void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(nullptr == conv);
+    ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
+    ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+    ARM_COMPUTE_ERROR_ON(0 == scale);
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(height / 2, width / 2);
+
+    const uint32_t num_coefficients = width * height;
+
+    // All MAX_MATRIX_SIZE -DMAT defines are emitted: the caller's coefficients fill the front, the rest stay 0
+    int16_t padded_matrix[MAX_MATRIX_SIZE] = { 0 };
+    memcpy(padded_matrix, conv, num_coefficients * sizeof(int16_t));
+
+    std::set<std::string> build_opts;
+    for(unsigned int slot = 0; slot < MAX_MATRIX_SIZE; ++slot)
+    {
+        build_opts.insert("-DMAT" + val_to_string(slot) + "=" + val_to_string(padded_matrix[slot]));
+    }
+
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+    build_opts.insert("-DMATRIX_WIDTH=" + val_to_string(width));
+    build_opts.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+
+    const DataType matrix_type = data_type_for_convolution_matrix(conv, num_coefficients);
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(matrix_type));
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    const unsigned int     num_rows_read_per_iteration       = height;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // Reads start one border to the left of and above the current element and span `height` rows
+    AccessWindowRectangle  src_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal dst_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, src_access, dst_access);
+
+    dst_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+// Enqueues the kernel once per 2D slice of the window, binding input then output each time.
+//
+// window: region to process; checked against the window stored on the kernel.
+// queue:  command queue the kernel is enqueued on.
+void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards continue for as long as the window can slide
+    for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice);
+        add_2D_tensor_argument(arg_idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+}
+
+template class arm_compute::CLConvolutionKernel<3>; // Explicit instantiations: the member templates are defined in this file
+template class arm_compute::CLConvolutionKernel<5>;
+template class arm_compute::CLConvolutionKernel<7>;
+template class arm_compute::CLConvolutionKernel<9>;
+template class arm_compute::CLSeparableConvolutionVertKernel<5>; // Separable kernels only accept sizes 5, 7 and 9
+template class arm_compute::CLSeparableConvolutionVertKernel<7>;
+template class arm_compute::CLSeparableConvolutionVertKernel<9>;
+template class arm_compute::CLSeparableConvolutionHorKernel<5>;
+template class arm_compute::CLSeparableConvolutionHorKernel<7>;
+template class arm_compute::CLSeparableConvolutionHorKernel<9>;
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..73f1ba15df
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) // Empty state; configure() fills in all four
+{
+}
+
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right); // Half the output/input size gap per dimension, as computed by configure()
+}
+
+// Configures the "concatenate_depth" kernel that copies input into output starting at depth_offset.
+//
+// input:        source tensor (single channel F32); may be smaller than output in dimensions 0 and 1.
+// depth_offset: index along dimension 2 of output where the input starts.
+// output:       destination tensor (single channel F32).
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The size gap in each of the two lowest dimensions is split evenly between both sides,
+    // so it has to be even; with an odd gap the padding of the input would be ambiguous
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+    // Half of each size gap becomes the border on that axis
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    // Byte offset inside output of the first element written: depth_offset planes, _top_bottom rows, _left_right elements
+    const auto        &out_strides          = output->info()->strides_in_bytes();
+    const unsigned int first_element_offset = depth_offset * out_strides[2] + _left_right * out_strides[0] + _top_bottom * out_strides[1];
+
+    // The extra kernel argument sits right after the input and output tensor arguments
+    const unsigned int offset_arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<unsigned int>(offset_arg_idx, first_element_offset);
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 4;
+    const unsigned int num_elems_read_per_iteration      = 4;
+    const unsigned int num_rows_read_per_iteration       = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+    AccessWindowRectangle  src_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal dst_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, src_access, dst_access);
+
+    dst_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+// Enqueues the kernel once per 2D slice of the window, binding input then output each time.
+//
+// window: region to process; checked against the window stored on the kernel.
+// queue:  command queue the kernel is enqueued on.
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards continue for as long as the window can slide
+    for(bool more = true; more; more = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice);
+        add_2D_tensor_argument(arg_idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
new file mode 100644
index 0000000000..24608bd17c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Configures the depth conversion between two integer tensors of different data types.
+//
+// input:  source tensor (U8, S16, U16, U32 or S32).
+// output: destination tensor; same set of types, but it must differ from the input's type.
+// policy: only consulted for the "_down" kernel; WRAP selects -DWRAP, anything else -DSATURATE.
+// shift:  passed as the kernel argument that follows the two tensors; must be below 8.
+void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON(input == output);
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
+    ARM_COMPUTE_ERROR_ON(shift >= 8);
+
+    // Check if conversion is supported
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
+                                                                            && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U16 ->  [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] S16 ->  [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] U32 ->  [out] U8, U16, S16");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] S32 ->  [out] U8, U16, S16");
+
+    // The "_down" kernel is used when the input element is wider than the output element, "_up" otherwise
+    const size_t in_bytes  = data_size_from_type(input->info()->data_type());
+    const size_t out_bytes = data_size_from_type(output->info()->data_type());
+    const bool   narrowing = in_bytes > out_bytes;
+
+    // Collect build options; the overflow policy define is only emitted for the "_down" kernel
+    std::set<std::string> build_opts;
+    build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    if(narrowing)
+    {
+        build_opts.insert((policy == ConvertPolicy::WRAP) ? "-DWRAP" : "-DSATURATE");
+    }
+
+    const std::string kernel_name = std::string("convert_depth") + (narrowing ? "_down" : "_up");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // The shift argument sits right after the input and output tensor arguments
+    const unsigned int shift_arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(shift_arg_idx, shift);
+
+    // Configure kernel
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
new file mode 100644
index 0000000000..36ba06d528
--- /dev/null
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLDerivativeKernel::CLDerivativeKernel()
+    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
+{ // Starts with no tensors bound and both gradients disabled; configure() assigns all five members
+}
+
+BorderSize CLDerivativeKernel::border_size() const
+{
+    return BorderSize(1); // Border of 1; configure() negates its left/top fields to get the input read offsets
+}
+
+void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{ // Builds the "derivative" CL kernel for the X and/or Y gradient; passing a null output disables that gradient
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); // At least one of the two outputs must be given
+
+    _run_derivative_x = output_x != nullptr;
+    _run_derivative_y = output_y != nullptr;
+
+    if(_run_derivative_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_derivative_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Set build options: one preprocessor define per requested gradient
+    std::set<std::string> build_opts;
+
+    if(_run_derivative_x)
+    {
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_derivative_y)
+    {
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts));
+
+    // Configure kernel window: 16 elements per iteration; up to 3 input rows are declared as read
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_read_rows_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), 0, 0, 0, 0); // Placeholder; reassigned below for the enabled gradient(s)
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
+    if(_run_derivative_x && _run_derivative_y)
+    { // Both gradients: rectangle offset by the border in x and y, 3 rows high
+        input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+    }
+    else if(_run_derivative_x)
+    { // X gradient only: horizontal access offset by the left border
+        input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration);
+    }
+    else if(_run_derivative_y)
+    { // Y gradient only: rectangle offset by the top border, no x offset
+        input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+    }
+
+    update_window_and_padding(win,
+                              input_access,
+                              output_x_access,
+                              output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); // Called even if output_x is null
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); // Called even if output_y is null
+
+    ICLKernel::configure(win);
+}
+
+void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Binds the tensors and enqueues the kernel once for every 2D slice of the given window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0; // Kernel argument index, restarted for every slice
+        add_2D_tensor_argument(idx, _input, slice);
+
+        if(_run_derivative_x) // Only outputs enabled in configure() are bound, X before Y
+        {
+            add_2D_tensor_argument(idx, _output_x, slice);
+        }
+
+        if(_run_derivative_y)
+        {
+            add_2D_tensor_argument(idx, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
new file mode 100644
index 0000000000..3abd747011
--- /dev/null
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLDilateKernel::border_size() const
+{
+    return BorderSize(1); // Border of 1; configure() negates its left/top fields to get the input read offsets
+}
+
+void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{ // Builds the "dilate" CL kernel and its execution window; both tensors must be single-channel U8
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // Create kernel (no build options are passed)
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate"));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window: 8 outputs per iteration, declared input read of 16 elements x 3 rows
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); // Derived from the input's valid region
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
new file mode 100644
index 0000000000..a7aa88fc5c
--- /dev/null
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLErodeKernel::border_size() const
+{
+    return BorderSize(1); // Border of 1; configure() negates its left/top fields to get the input read offsets
+}
+
+void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{ // Builds the "erode" CL kernel and its execution window; both tensors must be single-channel U8
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // Create kernel (no build options are passed)
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode"));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window: 8 outputs per iteration, declared input read of 16 elements x 3 rows
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); // Derived from the input's valid region
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
new file mode 100644
index 0000000000..1d4d776730
--- /dev/null
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLFastCornersKernel::CLFastCornersKernel()
+    : ICLKernel(), _input(nullptr), _output(nullptr)
+{ // Starts with no images bound; configure() assigns both
+}
+
+BorderSize CLFastCornersKernel::border_size() const
+{
+    return BorderSize(3); // Border of 3; configure() negates its left/top fields to get the input read offsets
+}
+
+void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
+{ // Builds the "fast_corners" CL kernel; input/output must be 2D single-channel U8 and border_mode must be UNDEFINED
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
+
+    _input  = input;
+    _output = output;
+
+    // Create build options: non-maxima suppression is selected at kernel compile time
+    std::set<std::string> build_opts;
+
+    if(non_max_suppression)
+    {
+        build_opts.emplace("-DUSE_MAXSUPPRESSION");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts));
+
+    // Set static kernel arguments: the threshold goes right after the two tensors bound in run()
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<cl_float>(idx, threshold);
+
+    // Configure kernel window. NOTE(review): only 3 rows are declared as read while border_size() is 3 - confirm against the CL kernel
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration      = 7;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, border_size());
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
+
+    ICLKernel::configure(win);
+}
+
+void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Binds input and output and enqueues the kernel once for every 2D slice of the given window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0; // Kernel argument index, restarted for every slice
+        add_2D_tensor_argument(idx, _input, slice);
+        add_2D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+
+CLCopyToArrayKernel::CLCopyToArrayKernel()
+    : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
+{ // Starts with no image, key point array or counter buffer bound; configure() assigns all three
+}
+
+void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
+{ // Builds the "copy_to_keypoint" CL kernel; input must be 2D single-channel U8, corners and num_buffers must be non-null
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(corners == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
+
+    _input      = input;
+    _corners    = corners;
+    _num_buffer = num_buffers;
+
+    std::set<std::string> build_opts;
+
+    if(update_number)
+    {
+        build_opts.emplace("-DUPDATE_NUMBER");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts));
+
+    // Get how many pixels were skipped in the x dimension by the previous stages (x anchor of the input's valid region)
+    unsigned int offset = _input->info()->valid_region().anchor.x();
+
+    // Set static kernel arguments: capacity of the array, x offset, counter buffer, key point buffer
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input tensor parameters (the only tensor bound in run())
+    _kernel.setArg<unsigned int>(idx++, corners->max_num_values());
+    _kernel.setArg<cl_uint>(idx++, offset);
+    _kernel.setArg(idx++, *_num_buffer);
+    _kernel.setArg(idx++, _corners->cl_buffer());
+
+    // Configure kernel window: one element per iteration over the input
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    ICLKernel::configure(win);
+}
+
+void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Resets the counter buffer, then enqueues the kernel once for every 2D slice of the given window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Initialise _num_buffer to zero, as it is used as both input and output; the write is non-blocking (CL_FALSE)
+    static const unsigned int zero_init = 0; // Static storage, so the source is still alive whenever the queued write runs
+    queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0; // Kernel argument index, restarted for every slice
+        add_2D_tensor_argument(idx, _input, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
new file mode 100644
index 0000000000..981aad665a
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLFillBorderKernel::CLFillBorderKernel()
+    : ICLKernel(), _tensor(nullptr)
+{ // Starts with no tensor bound; configure() assigns it unless it exits early
+}
+
+bool CLFillBorderKernel::is_parallelisable() const
+{
+    return false; // Always false; run() accordingly insists on the full configured window
+}
+
+template <class T>
+void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{ // Reads the border constant as a T and passes it as kernel argument idx (idx is a by-value copy here)
+    T border_value;
+    constant_border_value.get(border_value);
+    ICLKernel::add_argument<T>(idx, border_value);
+}
+
+void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{ // Builds the border-filling CL kernel for a single-channel tensor; does nothing for an empty or UNDEFINED border
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+    border_size.limit(tensor->info()->padding()); // Limit the requested border by the tensor's padding
+
+    // If there is no border: early exit (no kernel is created; run() detects this and returns immediately)
+    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    {
+        return;
+    }
+
+    // Select appropriate kernel: the name suffix is the lower-cased border mode string
+    std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Define select type required by replicate border > 1: "int" for F32, "short" for other float types, else the data type itself
+    const DataType dt          = tensor->info()->data_type();
+    std::string    select_type = get_cl_type_from_data_type(dt);
+    if(is_data_type_float(dt))
+    {
+        select_type = (DataType::F32 == dt) ? "int" : "short";
+    }
+
+    // Define build options: element type, select type and the four border widths
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.emplace(("-DSELECT_TYPE=" + select_type));
+    build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
+    build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
+    build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
+    build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _tensor = tensor;
+
+    // Create static kernel arguments from the tensor's valid region (width, height and anchor)
+    const unsigned int valid_width  = tensor->info()->valid_region().shape[0];
+    const unsigned int valid_height = tensor->info()->valid_region().shape[1];
+    const cl_int2      valid_region_coords =
+    {
+        {
+            static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
+            static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+        }
+    };
+    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+    // Set static kernel arguments, placed after the tensor arguments that run() binds
+    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    ICLKernel::add_argument<cl_uint>(idx, valid_width);
+    ICLKernel::add_argument<cl_uint>(idx, valid_height);
+    ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
+    if(BorderMode::CONSTANT == border_mode) // Only the constant mode passes a fill value, typed to match the tensor
+    {
+        switch(dt)
+        {
+            case DataType::U8:
+                set_constant_border<uint8_t>(idx, constant_border_value);
+                break;
+            case DataType::U16:
+                set_constant_border<uint16_t>(idx, constant_border_value);
+                break;
+            case DataType::S16:
+                set_constant_border<int16_t>(idx, constant_border_value);
+                break;
+            case DataType::U32:
+                set_constant_border<uint32_t>(idx, constant_border_value);
+                break;
+            case DataType::S32:
+                set_constant_border<int32_t>(idx, constant_border_value);
+                break;
+            case DataType::F32:
+                static_assert(sizeof(float) == 4, "Float must be 32 bit");
+                set_constant_border<float>(idx, constant_border_value);
+                break;
+            case DataType::F16:
+                static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
+                set_constant_border<cl_half>(idx, constant_border_value);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not handled");
+        }
+    }
+
+    // Configure kernel window: X spans (total_valid_width + valid_height) items, Y is a single row, Z follows the tensor
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+    ICLKernel::configure(win);
+}
+
+void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the border fill once for every 2D slice; a no-op if configure() exited early
+    // Border mode undefined or border width == 0: configure() returned before creating a kernel
+    if(_kernel() == nullptr)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0; // Kernel argument index, restarted for every slice
+        add_2D_tensor_argument(idx, _tensor, slice);
+        enqueue(queue, *this, slice, cl::NullRange); // cl::NullRange is passed as the extra range argument
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..71d42c5606
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
+    : _input(nullptr), _output(nullptr)
+{ // Starts with no tensors bound; configure() assigns both
+}
+
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{ // Builds the interleave kernel; output must have the input's data type, 4x its dimension 0 and ceil(dimension 1 / 4)
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f)); // NOTE(review): no <cmath> among this file's includes - confirm it arrives transitively
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel: the name suffix is the element size in bits followed by "bit"
+    std::string data_type_name;
+    data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+    _kernel        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
+
+    // Configure kernel window: each iteration covers 4 input rows and writes them as a single output row
+    const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); // 4.f / 0.25f are presumably x / y scale factors, matching run()
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the kernel once per 2D slice, pairing each input slice with a rescaled output slice
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    /*
+     * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4) ] (as checked in configure())
+     */
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = window.first_slice_window_2D();
+
+    // Scale the output slice by 4 along X and by 0.25 along Y to match the interleaved layout
+    out_slice.scale(Window::DimX, 4.f);
+    out_slice.scale(Window::DimY, 0.25f);
+
+    do
+    {
+        unsigned int idx = 0; // Kernel argument index, restarted for every slice
+        add_2D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice); // The launch range comes from the input slice
+    }
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..c6e05b92a2
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates; // Forward declaration only; configure() below constructs a Coordinates, so the full definition must come from the includes
+} // namespace arm_compute
+
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+                                               int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Build the "gemm_mm_u8" kernel; the width of matrix B is passed as a compile-time define
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+
+    // Static arguments are set once here, after the argument slots of the three 2D tensors bound in run()
+    unsigned int arg_idx = 3 * num_arguments_per_2D_tensor();
+    _kernel.setArg<int32_t>(arg_idx++, a_offset);
+    _kernel.setArg<int32_t>(arg_idx++, b_offset);
+    _kernel.setArg<int32_t>(arg_idx++, output_offset);
+    _kernel.setArg<int32_t>(arg_idx++, output_mult_int);
+    _kernel.setArg<int32_t>(arg_idx++, shift);
+
+    // Configure window: each iteration covers a 16x4 block of the output
+    constexpr unsigned int step_x          = 16;
+    constexpr unsigned int step_y          = 4;
+    constexpr unsigned int input0_read_len = 4;
+    constexpr unsigned int input1_read_len = 16;
+    Window                 win             = calculate_max_window(*output->info(), Steps(step_x, step_y));
+
+    AccessWindowRectangle input0_access(input0->info(), 0, 0, input0_read_len, 1);
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, input1_read_len, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, step_x, step_y);
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // Window covering all of matrix B along X and Y, with Z fixed to Dimension(0, 1, 1)
+    Window whole_matrix_b = plane;
+    whole_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+    whole_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+    whole_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+    const bool reuse_matrix_b = _input1->info()->num_dimensions() < 3;
+
+    // Enqueue the kernel once per 2D plane of the execution window
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        const Window plane_b = reuse_matrix_b ? whole_matrix_b : plane;
+
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input0, plane);
+        add_2D_tensor_argument(arg_idx, _input1, plane_b);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..289873c23f
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+    : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); // biases must be a 1D tensor
+
+    _biases = biases;
+    _accum  = accum;
+
+    // Create kernel: the variant is selected by the lower-case data type name of the accumulator
+    std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
+    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+
+    // Configure kernel window: the step along X is max_cl_vector_width divided by the size of the data type
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+
+    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1)); // static region from (0, 0) to (dimension(0), dimension(1))
+    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, biases_access, accum_access);
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window accum_plane = window.first_slice_window_2D();
+
+    // The biases are bound through a copy of the first slice whose Y dimension is reset to Dimension(0, 1, 1)
+    Window biases_row = accum_plane;
+    biases_row.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    // Run kernel: one enqueue per 2D plane of the accumulator, always with the same biases window
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(accum_plane))
+    {
+        // Set arguments
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _accum, accum_plane);
+        add_1D_tensor_argument(arg_idx, _biases, biases_row);
+
+        enqueue(queue, *this, accum_plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..343838f2f9
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+    _input  = input;
+    _output = output;
+
+    // BETA is passed to the OpenCL program as a compile-time define. It is written with 9 significant digits,
+    // enough to round-trip any IEEE-754 single-precision value; the stream default of 6 would silently truncate it
+    std::ostringstream ma_arguments;
+    ma_arguments.precision(9);
+    ma_arguments << "-DBETA=" << beta;
+    std::set<std::string> build_opts = { ma_arguments.str() };
+
+    // Create kernel: the variant is selected by the lower-case data type name of the input
+    std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+
+    // Configure kernel window: the step along X is max_cl_vector_width divided by the size of the data type
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+    Window             win                               = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Enqueue the kernel once per 2D plane; input and output are bound with the same slice
+    Window plane = window.first_slice_window_2D();
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        // Arguments are rebound for every plane, starting again from index 0
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..d7388e8579
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+    // A single-row output selects the vector-matrix kernel (gemm_vm_*); anything else selects the matrix-matrix kernel (gemm_mm_*)
+    const bool is_vector_output = (output->info()->dimension(1) == 1);
+    if(is_vector_output)
+    {
+        // Only this path checks the operands: dimension 0 of input0 must match dimension 1 of input1
+        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    }
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Local work-group size hint used by run(): (1, 7) when dimension 1 of the output is exactly 196, (8, 8) otherwise
+    _lws_hint = (output->info()->dimension(1) == 196) ? cl::NDRange(1, 7) : cl::NDRange(8, 8);
+
+    // The matrix sizes and ALPHA are passed to the OpenCL program as compile-time defines. ALPHA is written with 9 significant
+    // digits, enough to round-trip any IEEE-754 single-precision value; the stream default of 6 would silently truncate it
+    std::ostringstream mm_arguments;
+    mm_arguments.precision(9);
+    mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
+    mm_arguments << "-DALPHA=" << alpha << " ";
+    std::set<std::string> build_opts;
+
+    // Both paths share the data type suffix of the kernel name and the number of elements processed along X
+    const std::string  data_type_name                      = lower_string(string_from_data_type(input0->info()->data_type()));
+    const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+    if(is_vector_output)
+    {
+        mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+        build_opts.emplace(mm_arguments.str());
+
+        // Create kernel
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
+
+        // Configure window kernel: the window only steps along X
+        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+        AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        // The valid region is set to the full shape of the output tensor
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+        ICLKernel::configure(win);
+    }
+    else
+    {
+        build_opts.emplace(mm_arguments.str());
+
+        // Create kernel: the F32 kernel name additionally carries the GPU architecture of the target
+        if(data_type_name == "f32")
+        {
+            GPUTarget arch_target = get_arch_from_target(get_target());
+            _kernel               = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+        }
+        else
+        {
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+        }
+
+        // Configure window kernel: the window steps by num_elems_processed_per_iteration_x along X and by 4 along Y
+        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        // The valid region is set to the full shape of the output tensor
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+        ICLKernel::configure(win);
+    }
+}
+
+void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // Window covering all of matrix B along X and Y, with Z fixed to Dimension(0, 1, 1)
+    Window whole_matrix_b = plane;
+    whole_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+    whole_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+    whole_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+    const bool reuse_matrix_b = _input1->info()->num_dimensions() < 3;
+
+    // Enqueue the kernel once per 2D plane of the execution window, passing the stored _lws_hint
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        const Window plane_b = reuse_matrix_b ? whole_matrix_b : plane;
+
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input0, plane);
+        add_2D_tensor_argument(arg_idx, _input1, plane_b);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane, _lws_hint);
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ecee1abd72
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t transpose_w = 16 / input->info()->element_size(); // W of the 1xW transposition: 16 divided by the element size
+    output_shape.set(0, input->info()->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w))))); // rounded up when dimension 0 is not a multiple of W
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _input                                               = input;
+    _output                                              = output;
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+    /*
+     * Example of how the transposition1xW works when the input data type is F32
+     *
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * The output matrix has the shape [ height * W, width / W ], where W is the number of elements
+     * transposed together: 4 for F32, 8 for F16 and 16 for U8
+     */
+    // Create kernel: the name carries both the number of elements processed per iteration and the data type
+    std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+    std::string kernel_name    = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
+    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    float scale_x = 1.f; // X scale of the output access window; its Y scale is the reciprocal
+
+    switch(input->info()->data_type())
+    {
+        case DataType::U8:
+            scale_x = 16.f;
+            break;
+        case DataType::F16:
+            scale_x = 8.f;
+            break;
+        case DataType::F32:
+            scale_x = 4.f;
+            break;
+        default:
+            // Do nothing: scale_x keeps its initial value of 1
+            break;
+    }
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowTranspose  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); // valid region set to the full output shape
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // The output is transposed: its window is the execution window with the X and Y dimensions swapped
+    Window transposed_window = window;
+    transposed_window.set(Window::DimX, window.y());
+    transposed_window.set(Window::DimY, window.x());
+
+    Window src_plane = window.first_slice_window_2D();
+    Window dst_plane = transposed_window.first_slice_window_2D();
+
+    // Enqueue the kernel once per 2D plane, sliding the input and output slices together
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(src_plane) && transposed_window.slide_window_slice_2D(dst_plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, src_plane);
+        add_2D_tensor_argument(arg_idx, _output, dst_plane);
+        enqueue(queue, *this, src_plane, _lws_hint);
+    }
+}
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..e5bc3f9656
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLGaussian3x3Kernel::border_size() const
+{
+    return BorderSize(1); // configure() offsets the input access window by -left / -top of this border
+}
+
+void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    // Set build options: the nine MAT coefficients sum to 16, which is also the value passed as SCALE
+    std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
+                                         "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
+                                         "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
+                                         "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
+                                       };
+
+    // Create kernel: the generic static 3x3 convolution, specialised through the build options above
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+    // Configure kernel window: 8 elements are processed and written per iteration, reading 16 elements from each of 3 rows
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); // starts one border to the left of / above the current element
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..bd523c883d
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    const int16_t gaussian_coeffs[5] = { 1, 4, 6, 4, 1 }; // five fixed filter coefficients
+
+    // Forward to the generic separable 5x5 horizontal kernel with these coefficients
+    CLSeparableConvolution5x5HorKernel::configure(input, output, gaussian_coeffs, border_undefined);
+}
+
+void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    const int16_t  gaussian_coeffs[5] = { 1, 4, 6, 4, 1 }; // five fixed filter coefficients (sum 16)
+    const uint32_t norm_scale         = 256;               // 16 * 16; presumably normalises both separable passes - confirm
+
+    // Forward to the generic separable 5x5 vertical kernel with these coefficients and scale
+    CLSeparableConvolution5x5VertKernel::configure(input, output, gaussian_coeffs, norm_scale, border_undefined);
+}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..34a228c717
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
+    : _border_size(0), _l2_load_offset(0) // placeholder values; configure() assigns both members
+{
+}
+
+BorderSize CLGaussianPyramidHorKernel::border_size() const
+{
+    return _border_size; // value stored by configure(); it depends on configure()'s border_undefined argument
+}
+
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{ // Sets up the "gaussian1x5_sub_x" kernel: U8 input, U16 output whose width is half the input width
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) // all remaining dimensions must match exactly
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+    }
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(border_undefined ? 0 : 2, 2); // first argument is 0 only when the border is undefined
+
+    // Create kernel (no build options are passed)
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
+
+    // Configure kernel window: 16 input elements per step, 20 read, 8 written (output scaled by 0.5 along X)
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_read_per_iteration      = 20;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr float        scale_x                           = 0.5f;
+
+    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+    // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
+    // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+    // a pixel is even or odd is determined based on the tensor shape not the
+    // valid region!)
+    // Thus the offset from which the first pixel (L2) for the convolution is
+    // loaded depends on the anchor and shape of the valid region.
+    // In the case of an even shape (= even image width) we need to load L2
+    // from -2 if the anchor is odd and from -1 if the anchor is even. That
+    // makes sure that L2 is always loaded from an odd pixel.
+    // On the other hand, for an odd shape (= odd image width) we need to load
+    // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+    // achieve the opposite effect.
+    // The condition can be simplified to checking whether anchor + shape is
+    // odd (-2) or even (-1) as only adding an odd and an even number will have
+    // an odd result.
+    _l2_load_offset = -border_size().left; // starts at minus the left border; adjusted by +1 below when anchor + shape is even
+
+    if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+    {
+        _l2_load_offset += 1;
+    }
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+                              output_access);
+
+    ValidRegion valid_region = input->info()->valid_region(); // start from the input's valid region, then rescale dimension 0
+    valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+    valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]); // uses the anchor updated on the previous line
+
+    output_access.set_valid_region(win, valid_region);
+
+    ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Output window: the execution window scaled by 0.5 along X (the output is half the width of the input)
+    Window output_window(window);
+    output_window.scale(Window::DimX, 0.5f);
+    // Input window: the execution window shifted along X by the load offset chosen in configure()
+    Window input_window(window);
+    input_window.shift(Window::DimX, _l2_load_offset);
+
+    Window input_slice  = input_window.first_slice_window_2D();
+    Window output_slice = output_window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int arg_idx = 0; // tensor arguments are re-bound from index 0 for every 2D slice
+        add_2D_tensor_argument(arg_idx, _input, input_slice);
+        add_2D_tensor_argument(arg_idx, _output, output_slice);
+        enqueue(queue, *this, output_slice);
+    }
+    while(input_window.slide_window_slice_2D(input_slice) && output_window.slide_window_slice_2D(output_slice));
+}
+
+CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
+    : _t2_load_offset(0) // placeholder value; configure() assigns the real row offset
+{
+}
+
+BorderSize CLGaussianPyramidVertKernel::border_size() const
+{
+    return BorderSize(2, 0); // fixed border; configure() reads its top and bottom fields (presumably 2 rows each, 0 columns - confirm)
+}
+
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{ // Sets up the "gaussian5x1_sub_y" kernel: U16 input, U8 output whose height is half the input height
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) // all remaining dimensions must match exactly
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+    }
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel (no build options are passed)
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y"));
+
+    // Configure kernel window: steps of 8 elements x 2 rows, reading an 8 x 5 rectangle (output scaled by 0.5 along Y)
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_rows_processed_per_iteration  = 2;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 8;
+    constexpr unsigned int num_rows_per_iteration            = 5;
+    constexpr float        scale_y                           = 0.5f;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
+                                      border_undefined, border_size());
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y); // NOTE(review): reuses the 5-row read height for the output access - confirm intended
+
+    // Determine whether we need to load even or odd rows. The reasoning is the same as the
+    // column case explained in CLGaussianPyramidHorKernel::configure(), applied to dimension 1.
+    _t2_load_offset = -border_size().top; // starts at minus the top border; adjusted by +1 below when anchor + shape is even
+
+    if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+    {
+        _t2_load_offset += 1;
+    }
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
+                              output_access);
+
+    ValidRegion valid_region = input->info()->valid_region(); // start from the input's valid region, then rescale dimension 1
+    valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+    valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]); // uses the anchor updated on the previous line
+
+    output_access.set_valid_region(win, valid_region);
+
+    ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the kernel once per 2D slice of the given execution window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); // same qualification as the other CL kernels' run()
+    ARM_COMPUTE_ERROR_ON(window.x().step() != 8);                        // must match the 8-element step chosen in configure()
+    ARM_COMPUTE_ERROR_ON(window.y().step() % 2);                         // the Y step has to be even
+
+    Window win_in(window); // input window: shifted along Y by the load offset chosen in configure()
+    win_in.shift(Window::DimY, _t2_load_offset);
+
+    Window win_out(window); // output window: scaled by 0.5 along Y (output has half the rows)
+    win_out.scale(Window::DimY, 0.5f);
+
+    Window slice_in  = win_in.first_slice_window_2D();
+    Window slice_out = win_out.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0; // tensor arguments are re-bound from index 0 for every slice
+        add_2D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+    }
+    while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..87659c4ba9
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+    : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size() // tensors stay null until configure() is called
+{
+}
+
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{ // Sets up the "hog_orientation_binning" kernel: S16 magnitude + U8 phase in, F32 output with num_bins channels
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr); // checked before the first dereference of hog_info on the next line
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+    _input_magnitude = input_magnitude;
+    _input_phase     = input_phase;
+    _output          = output;
+    _cell_size       = hog_info->cell_size();
+
+    float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f); // num_bins divided by 360 (SIGNED) or 180 (otherwise)
+    phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f); // extra 360/255 factor for SIGNED; presumably the U8 phase spans 0..255 - confirm
+
+    std::stringstream args_str;
+    args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+    args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+    args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+    args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+    // Collect the build options: all defines above are inserted as one space-separated string
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration      = 1;
+    const unsigned int     num_rows_read_per_iteration       = hog_info->cell_size().height;
+    constexpr unsigned int num_elems_written_per_iteration   = 1;
+
+    // Configure kernel window: the window is computed over the output tensor, one element per step
+    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); // the whole output shape is marked valid
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the kernel once per 2D slice of the given execution window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D(); // output slice; advanced by the loop condition
+    do
+    {
+        // Compute slice for the magnitude and phase tensors: start = end = window start * cell size, step = cell size
+        Window slice_mag_phase = window.first_slice_window_2D(); // NOTE(review): derived from `window`, not `slice`, so it is the same on every iteration
+        slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+        slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+        unsigned int idx = 0; // tensor arguments are re-bound from index 0 for every slice
+        add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+        add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+    : _input(nullptr), _output(nullptr), _num_cells_per_block_stride() // tensors stay null until configure() is called
+{
+}
+
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{ // Sets up the "hog_block_normalization" kernel: F32 input with num_bins channels, F32 output with one channel per bin of a block
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+    // Number of cells per block (block size divided by cell size, per dimension)
+    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+                                     hog_info->block_size().height / hog_info->cell_size().height);
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+    // Number of cells per block stride (block stride divided by cell size, per dimension)
+    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+                                            hog_info->block_stride().height / hog_info->cell_size().height);
+
+    _input                      = input;
+    _output                     = output;
+    _num_cells_per_block_stride = num_cells_per_block_stride;
+
+    std::stringstream args_str;
+    args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+    args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+    args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+    args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+    args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+    args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+    // Collect the build options: all defines above are inserted as one space-separated string
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration      = 1;
+    const unsigned int     num_rows_read_per_iteration       = num_cells_per_block.height;
+    constexpr unsigned int num_elems_written_per_iteration   = 1;
+    const unsigned int     num_rows_written_per_iteration    = num_cells_per_block.height;
+
+    // Configure kernel window: the window is computed over the output tensor, one element per step
+    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the kernel once per 2D slice of the given execution window
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D(); // output slice; advanced by the loop condition
+    do
+    {
+        // Compute slice for the input tensor: same range as the window, stepped by the cells-per-block stride
+        Window slice_in = window.first_slice_window_2D(); // NOTE(review): derived from `window`, not `slice`, so it is the same on every iteration
+        slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+        slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+        unsigned int idx = 0; // tensor arguments are re-bound from index 0 for every slice
+        add_2D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..0f9a98950d
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+    : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr) // members are assigned by configure()
+{
+}
+
+void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+                                    float threshold, uint16_t idx_class)
+{ // Sets up the "hog_detector" kernel; the execution window enumerates detection-window positions in block units
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(hog == nullptr);
+    ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+    ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+    ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+    const Size2D &detection_window_size = hog->info()->detection_window_size();
+    const Size2D &block_size            = hog->info()->block_size();
+    const Size2D &block_stride          = hog->info()->block_stride();
+
+    _input                 = input;
+    _detection_windows     = detection_windows;
+    _num_detection_windows = num_detection_windows;
+
+    const unsigned int num_bins_per_descriptor_x   = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+    const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size()); // descriptor holds one extra value (presumably a bias term - confirm)
+
+    std::stringstream args_str;
+    args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+    args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+    args_str << "-DTHRESHOLD=" << threshold << " ";
+    args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+    args_str << "-DIDX_CLASS=" << idx_class << " ";
+    args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+    args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+    args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+    args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+    // Collect the build options: all defines above are inserted as one space-separated string
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+    _kernel.setArg(idx++, hog->cl_buffer());
+    _kernel.setArg(idx++, detection_windows->cl_buffer());
+    _kernel.setArg(idx++, *_num_detection_windows);
+
+    // Get the number of blocks along the x and y directions of the input tensor
+    const ValidRegion &valid_region = input->info()->valid_region();
+    const size_t       num_blocks_x = valid_region.shape[0];
+    const size_t       num_blocks_y = valid_region.shape[1];
+
+    // Get the number of blocks along the x and y directions of the detection window
+    const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+    const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+    const size_t window_step_x = detection_window_stride.width / block_stride.width;
+    const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+    // Configure kernel window. The end of a window dimension is exclusive, so one step is added to keep the last valid start position
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
+    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
+
+    constexpr unsigned int num_elems_read_per_iteration = 1;
+    const unsigned int     num_rows_read_per_iteration  = num_blocks_per_descriptor_y;
+
+    update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D(); // walk the execution window one 2D slice at a time
+    do
+    {
+        unsigned int arg_idx = 0; // only the input tensor is bound here; the other arguments were set in configure()
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+
+        enqueue(queue, *this, slice_2d);
+    }
+    while(window.slide_window_slice_2D(slice_2d));
+}
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..9fc34a7760
--- /dev/null
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHarrisScoreKernel::CLHarrisScoreKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0) // all members are assigned by configure()
+{
+}
+
+BorderSize CLHarrisScoreKernel::border_size() const
+{
+    return _border_size; // value stored by configure(), which builds it from block_size / 2
+}
+
+void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+                                    int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+                                    bool border_undefined)
+{ // Selects and sets up the "harris_score_<N>x<N>" kernel for N = block_size (3, 5 or 7)
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+    ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+    ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+    _input1          = input1;
+    _input2          = input2;
+    _output          = output;
+    _sensitivity     = sensitivity;
+    _strength_thresh = strength_thresh;
+    _norm_factor     = norm_factor;
+    _border_size     = BorderSize(block_size / 2); // half the block size, i.e. 1, 2 or 3
+
+    // Select kernel: the name embeds the block size, e.g. "harris_score_3x3"
+    std::stringstream harris_score_kernel_name;
+    harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
+
+    // Create build options: the OpenCL data type follows the type of input1 (input2 must match it)
+    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts));
+
+    // Set static kernel arguments
+    unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg(idx++, sensitivity);
+    _kernel.setArg(idx++, strength_thresh);
+    _kernel.setArg(idx++, norm_factor);
+
+    // Configure kernel window. The read rectangle has to span the whole block: block_size rows, and 4 + 2 * (block_size / 2) columns
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    constexpr unsigned int num_elems_written_per_iteration   = 4;
+    const unsigned int     num_elems_read_per_iteration      = (block_size == 7) ? 10 : 8; // 7x7 needs 10 columns; 8 is kept for 3x3 and 5x5
+    const unsigned int     num_rows_read_per_iteration       = static_cast<unsigned int>(block_size);
+
+    Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowRectangle  input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region()); // output is valid only where both inputs are
+    output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D(); // walk the execution window one 2D slice at a time
+    do
+    {
+        unsigned int arg_idx = 0; // the three tensors come first; the scalar arguments after them were set in configure()
+        add_2D_tensor_argument(arg_idx, _input1, slice_2d);
+        add_2D_tensor_argument(arg_idx, _input2, slice_2d);
+        add_2D_tensor_argument(arg_idx, _output, slice_2d);
+        enqueue(queue, *this, slice_2d);
+    }
+    while(window.slide_window_slice_2D(slice_2d));
+}
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
new file mode 100644
index 0000000000..87ee5fb74e
--- /dev/null
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+#include <string>
+
+using namespace arm_compute;
+
+// Each thread handles 16 pixels
+constexpr signed int pixels_per_item = 16;
+
+// Local work-group size in the X dimension
+constexpr unsigned int local_x_size = 16;
+/** Default constructor: the input image and output distribution pointers stay null until configure() sets them. */
+CLHistogramKernel::CLHistogramKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+/** Validate the arguments, create the fixed or generic local-histogram kernel, set its static arguments and configure the execution window. */
+void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    // Check input data type (single-channel U8 only)
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+    // Check offset
+    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
+
+    // Check range
+    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
+
+    _input  = input;
+    _output = output;
+    // Images narrower than pixels_per_item stop here: the pointers are stored but no kernel is created and no window is set
+    if(_input->info()->dimension(0) < pixels_per_item)
+    {
+        return;
+    }
+
+    unsigned int num_bins    = _output->num_bins();
+    unsigned int window_size = _output->window();
+    unsigned int offset      = _output->offset();
+    unsigned int range       = _output->range();
+    unsigned int offrange    = offset + range;
+    unsigned int bin_size    = _output->size();
+    unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
+
+    // Create kernel: the "_fixed" variant is selected for 256 bins, window 1, offset 0 and range 256
+    bool        is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+    std::string kernel_name   = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
+    _kernel                   = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the 2D tensor arguments of the input image, which run() sets
+    _kernel.setArg(idx++, buffer_size, nullptr); // A size with a null pointer is how OpenCL is given a local-memory argument of that many bytes
+    _kernel.setArg(idx++, _output->cl_buffer());
+    if(!is_fixed_size)
+    {
+        _kernel.setArg<cl_uint>(idx++, num_bins);
+        _kernel.setArg<cl_uint>(idx++, offset);
+        _kernel.setArg<cl_uint>(idx++, range);
+        _kernel.setArg<cl_uint>(idx++, offrange);
+    }
+
+    // We only run histogram on Image, therefore only 2 dimensions here
+    unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item; // Width rounded down to a multiple of pixels_per_item
+
+    // Configure kernel window
+    Window win;
+    win.set(0, Window::Dimension(0, end_position, pixels_per_item));
+    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
+
+    ICLKernel::configure(win);
+}
+/** Clear the output distribution, then (for images at least pixels_per_item wide) enqueue the kernel once per 2D slice of @p window. */
+void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+    // Zero the bins before the early return below: narrow images are still processed by the border kernel (assuming it shares this distribution)
+    _output->map(queue, true);
+    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+    memset(_output->buffer(), 0, _output->size());
+    _output->unmap(queue);
+
+    if(_input->info()->dimension(0) < pixels_per_item)
+    {
+        return;
+    }
+
+    Window      slice = window.first_slice_window_2D();
+    cl::NDRange lws   = cl::NDRange(local_x_size, 1);
+
+    do
+    {
+        /* Run the core part, whose width is a multiple of 16 */
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        enqueue(queue, *this, slice, lws);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+/** Default constructor: the input image and output distribution pointers stay null until configure() sets them. */
+CLHistogramBorderKernel::CLHistogramBorderKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+/** Validate the arguments and, if the width is not a multiple of pixels_per_item, set up the border histogram kernel over the leftover columns. */
+void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    // Check input data type (single-channel U8 only)
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+    // Check offset
+    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
+
+    // Check range
+    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
+
+    // We only run histogram on Image, therefore only 2 dimensions here
+    unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item; // First column past the last full group of pixels_per_item
+    // When the width is a multiple of pixels_per_item nothing is left over: _input/_output are not stored and no window is set
+    if(start_position >= input->info()->dimension(0))
+    {
+        return; // no need to run histogram border kernel
+    }
+
+    _input  = input;
+    _output = output;
+
+    unsigned int num_bins    = _output->num_bins();
+    unsigned int window_size = _output->window();
+    unsigned int offset      = _output->offset();
+    unsigned int range       = _output->range();
+    unsigned int offrange    = offset + range;
+
+    // Create kernel: the "_fixed" variant is selected for 256 bins, window 1, offset 0 and range 256
+    bool        is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+    std::string kernel_name   = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
+    _kernel                   = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the 2D tensor arguments of the input image, which run() sets
+    _kernel.setArg(idx++, _output->cl_buffer());
+    if(!is_fixed_size)
+    {
+        _kernel.setArg<cl_uint>(idx++, num_bins);
+        _kernel.setArg<cl_uint>(idx++, offset);
+        _kernel.setArg<cl_uint>(idx++, range);
+        _kernel.setArg<cl_uint>(idx++, offrange);
+    }
+
+    // Configure kernel window: columns from start_position to the image width, over every row
+    Window win;
+    win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
+    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
+    ICLKernel::configure(win);
+}
+/** Enqueue the border histogram kernel once per 2D slice of @p window; returns immediately when the window is empty in X. */
+void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    if(window.x().start() >= window.x().end()) // Checked before the window assertion below, so an empty window is a silent no-op
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    cl::NDRange lws = cl::NDRange(1, 1); // Local work size of 1x1
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        /* Run the border part, i.e. the columns left over when the width is not a multiple of 16 */
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        enqueue(queue, *this, slice, lws);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
new file mode 100644
index 0000000000..8c0fe26666
--- /dev/null
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+/** Default constructor: no tensors bound, kernel size 0, one element per iteration and no run function selected yet. */
+CLIm2ColKernel::CLIm2ColKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+{
+}
+/** Store the tensors, choose between the "im2col_generic" and "im2col_reduced" kernels, set the static arguments and configure the window. */
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+
+    _input  = input;
+    _output = output;
+
+    // Build options shared by both kernel variants (the kernel itself is created further down)
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.emplace((has_bias ? "-DHAS_BIAS" : "")); // An empty option string is inserted when there is no bias
+
+    int pad_x    = 0;
+    int pad_y    = 0;
+    int stride_x = 0;
+    int stride_y = 0;
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    // Reduced path: output dim 0 equals the product of the first three input dims, the remaining dims match, stride is 1 and padding is 0
+    const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+                                     && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                    input->info()->tensor_shape().cend(),
+                                                    output->info()->tensor_shape().cbegin() + 1))
+                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+    if(!run_img2col_reduced)
+    {
+        _convolved_dims                    = convolved_dims;
+        _conv_info                         = conv_info;
+        _kernel_size                       = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2)); // Square root of the per-channel element count, i.e. this assumes a square kernel window
+        _num_elems_processed_per_iteration = output->info()->dimension(0); // Whole extent of output dimension 0 per iteration
+
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+        // Create static kernel arguments
+        const cl_int2 input_dims =
+        {
+            {
+                static_cast<cl_int>(input->info()->dimension(0)),
+                static_cast<cl_int>(input->info()->dimension(1)),
+            }
+        };
+        const cl_int2 strides =
+        {
+            {
+                stride_x,
+                stride_y,
+            }
+        };
+        const cl_int2 paddings =
+        {
+            {
+                pad_x,
+                pad_y,
+            }
+        };
+
+        // Set static kernel arguments
+        unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); // Skip the 3D input and 2D output tensor arguments, which run_generic() sets
+        _kernel.setArg<cl_int>(idx++, _kernel_size);
+        _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
+        _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
+        _kernel.setArg<cl_int2>(idx++, input_dims);
+        _kernel.setArg<cl_int2>(idx++, strides);
+        _kernel.setArg<cl_int2>(idx++, paddings);
+
+        _run_func = &CLIm2ColKernel::run_generic;
+    }
+    else
+    {
+        _num_elems_processed_per_iteration = 1;
+        _kernel                            = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+        _run_func                          = &CLIm2ColKernel::run_reduced;
+    }
+
+    // Configure kernel window over the whole input tensor
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+/** Forward to the member function chosen by configure() (run_generic or run_reduced). */
+void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON(_run_func == nullptr); // configure() has not selected a run function yet
+    (this->*_run_func)(window, queue);
+}
+/** Enqueue the "im2col_generic" kernel once per 3D slice of @p window, re-binding the input and output tensors each time. */
+void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // Three windows are advanced in lock-step below:
+    //  - slice:     the execution window handed to enqueue()
+    //  - slice_in:  the window used to bind the 3D input tensor
+    //  - slice_out: the window used to bind the 2D output tensor
+    // Padding and stride are not unpacked here: configure() already passed them to the
+    // kernel as static arguments.
+
+    // Get initial windows
+    Window slice     = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    // Setup slice: one element per convolved output position, a single Z plane
+    slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+    slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup input slice
+    // The first three dimensions of the input are increased by the inner loops
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output slice
+    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+    slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+}
+/** Enqueue the "im2col_reduced" kernel, pairing each 3D slice of the input window with a 1D slice of the output tensor. */
+void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+    // The output is walked with its own window, built from the output tensor's dimensions
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info());
+
+    Window out_slice = out_window.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_3D();
+
+    // Run kernel
+    do
+    {
+        // Set arguments
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_1D_tensor_argument(idx, _output, out_slice);
+        // Input dimensions 0 and 1 follow the tensor arguments
+        _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+        _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+        enqueue(queue, *this, in_slice);
+    }
+    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
new file mode 100644
index 0000000000..69ede457df
--- /dev/null
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+/** Store the U8 input and U32 output, create the "integral_horizontal" kernel and configure a window that steps one full row at a time. */
+void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); // The whole width is one iteration
+    const unsigned int num_elems_accessed_per_iteration  = ceil_to_multiple(num_elems_processed_per_iteration, 16); // Access windows use the width rounded up to a multiple of 16
+
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
+                              output_access);
+    // The output's valid region is derived from the input's valid region
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+/** Default constructor: the in-place tensor pointer stays null until configure() sets it. */
+CLIntegralImageVertKernel::CLIntegralImageVertKernel()
+    : _in_out(nullptr)
+{
+}
+/** Store the U32 tensor that is both read and written, create the "integral_vertical" kernel and configure its window. */
+void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
+
+    _in_out = in_out;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical"));
+
+    // Configure kernel window: each iteration covers 8 columns and the full height
+    constexpr unsigned int num_elems_processed_per_iteration_x = 8;
+    const unsigned int     num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
+
+    Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+    update_window_and_padding(win, in_out_access);
+    // The valid region is recomputed from the tensor's own current valid region
+    in_out_access.set_valid_region(win, in_out->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+/** Enqueue the kernel once per 2D slice of @p window, passing the tensor followed by its height. */
+void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const size_t height = _in_out->info()->dimension(1);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _in_out, slice);
+        _kernel.setArg<cl_uint>(idx++, height); // Passed as cl_uint, directly after the tensor arguments
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
new file mode 100644
index 0000000000..12cdd0ec93
--- /dev/null
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+/** Create the "init_level" kernel variant for @p level, set its arguments and configure a 1D window with one element per entry of @p old_points. */
+void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+                                      ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                                      bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
+
+{
+    ARM_COMPUTE_ERROR_ON(old_points == nullptr);
+    ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(use_initial_estimate && (level == (num_levels - 1)) && (new_points_estimates == nullptr)); // Only dereferenced in that case, see below
+    const float scale = std::pow(pyramid_scale, level);
+
+    // Create kernel: the last level (num_levels - 1) uses a "_max" variant, with or without the initial estimate
+    std::string kernel_name = "init_level";
+    if(level == (num_levels - 1))
+    {
+        kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Set static kernel arguments; the "_max" variants additionally take the point arrays first
+    unsigned int idx = 0;
+    if(level == (num_levels - 1))
+    {
+        _kernel.setArg(idx++, old_points->cl_buffer());
+        if(use_initial_estimate)
+        {
+            _kernel.setArg(idx++, new_points_estimates->cl_buffer());
+        }
+    }
+    _kernel.setArg(idx++, old_points_internal->cl_buffer());
+    _kernel.setArg(idx++, new_points_internal->cl_buffer());
+    _kernel.setArg<cl_float>(idx++, scale);
+
+    // Configure kernel window
+    Window window;
+    window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
+    window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    ICLKernel::configure(window);
+}
+/** Enqueue the kernel over @p window; every argument was already set in configure(). */
+void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    enqueue(queue, *this, window);
+}
+/** Create the "finalize" kernel, bind both point arrays and configure a 1D window with one element per entry of @p new_points_internal. */
+void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
+
+{
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(new_points == nullptr);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize"));
+
+    // Set static kernel arguments: internal points first, then the public point array
+    unsigned int idx = 0;
+    _kernel.setArg(idx++, new_points_internal->cl_buffer());
+    _kernel.setArg(idx++, new_points->cl_buffer());
+
+    // Configure kernel window
+    Window window;
+    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+    window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    ICLKernel::configure(window);
+}
+/** Enqueue the kernel over @p window; every argument was already set in configure(). */
+void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    enqueue(queue, *this, window);
+}
+/** Default constructor: the three tensor pointers stay null until configure() sets them. */
+CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
+    : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
+{
+}
+/** Store the three input tensors, configure a 1D window with one element per entry of @p new_points_internal, then create "lktracker_stage0" and set its static arguments. */
+void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+                                        ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                                        ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                                        size_t window_dimension, size_t level)
+
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+    ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+    _old_input     = old_input;
+    _old_scharr_gx = old_scharr_gx;
+    _old_scharr_gy = old_scharr_gy;
+
+    // Configure kernel window
+    Window window;
+    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+    window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    // The region used below is the intersection of the three tensors' valid regions
+    const ValidRegion valid_region = intersect_valid_regions(
+                                         old_input->info()->valid_region(),
+                                         old_scharr_gx->info()->valid_region(),
+                                         old_scharr_gy->info()->valid_region());
+    // Declare a static access over that common region for each of the three tensors
+    update_window_and_padding(window,
+                              AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
+                                                 valid_region.end(0), valid_region.end(1)),
+                              AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
+                                                 valid_region.end(0), valid_region.end(1)),
+                              AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
+                                                 valid_region.end(0), valid_region.end(1)));
+
+    ICLKernel::configure(window);
+
+    // Initialize required variables
+    const int       level0              = (level == 0) ? 1 : 0; // Flag passed to the kernel: 1 only for level 0
+    const int       window_size         = window_dimension;
+    const int       window_size_squared = window_dimension * window_dimension;
+    const int       window_size_half    = window_dimension / 2;
+    const float     eig_const           = 1.0f / (2.0f * window_size_squared);
+    const cl_float3 border_limits =
+    {
+        {
+            // -1 because we load 2 values at once for bilinear interpolation
+            static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+            static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+            static_cast<cl_float>(valid_region.start(0)) // NOTE(review): only the X start of the valid region is passed as the third component
+        }
+    };
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0"));
+
+    // Set arguments, leaving room for the three 2D tensors that run() binds first
+    unsigned int idx = 3 * num_arguments_per_2D_tensor();
+    _kernel.setArg(idx++, old_points_internal->cl_buffer());
+    _kernel.setArg(idx++, new_points_internal->cl_buffer());
+    _kernel.setArg(idx++, coeff_table->cl_buffer());
+    _kernel.setArg(idx++, old_ival->cl_buffer());
+    _kernel.setArg<cl_int>(idx++, window_size);
+    _kernel.setArg<cl_int>(idx++, window_size_squared);
+    _kernel.setArg<cl_int>(idx++, window_size_half);
+    _kernel.setArg<cl_float3>(idx++, border_limits);
+    _kernel.setArg<cl_float>(idx++, eig_const);
+    _kernel.setArg<cl_int>(idx++, level0);
+}
+/** Bind the three input tensors as the first kernel arguments and enqueue the kernel over @p window. */
+void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set static tensor arguments. Setting here as allocation might be deferred.
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _old_input, window);
+    add_2D_tensor_argument(idx, _old_scharr_gx, window);
+    add_2D_tensor_argument(idx, _old_scharr_gy, window);
+
+    enqueue(queue, *this, window);
+}
+/** Default constructor: the input tensor pointer stays null until configure() sets it. */
+CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
+    : _new_input(nullptr)
+{
+}
+/** Store the input tensor, configure a 1D window with one element per entry of @p new_points_internal, then create "lktracker_stage1" and set its static arguments. */
+void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                                        Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
+
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+    ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+    _new_input = new_input;
+
+    // Configure kernel window
+    Window window;
+    window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+    window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    const ValidRegion &valid_region = new_input->info()->valid_region();
+    // Declare a static access over the input's valid region
+    update_window_and_padding(window,
+                              AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
+                                                 valid_region.end(0), valid_region.end(1)));
+
+    ICLKernel::configure(window);
+
+    // Initialize required variables
+    const int       level0              = (level == 0) ? 1 : 0; // Flag passed to the kernel: 1 only for level 0
+    const int       window_size         = window_dimension;
+    const int       window_size_squared = window_dimension * window_dimension;
+    const int       window_size_half    = window_dimension / 2;
+    const float     eig_const           = 1.0f / (2.0f * window_size_squared);
+    const cl_float3 border_limits =
+    {
+        {
+            // -1 because we load 2 values at once for bilinear interpolation
+            static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+            static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+            static_cast<cl_float>(valid_region.start(0)) // NOTE(review): only the X start of the valid region is passed as the third component
+        }
+    };
+    const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; // 1 when the iteration criterion is selected
+    const int term_epsilon   = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; // 1 when the epsilon criterion is selected
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
+
+    // Set static kernel arguments, leaving room for the 2D input tensor that run() binds first
+    unsigned int idx = num_arguments_per_2D_tensor();
+    _kernel.setArg(idx++, new_points_internal->cl_buffer());
+    _kernel.setArg(idx++, coeff_table->cl_buffer());
+    _kernel.setArg(idx++, old_ival->cl_buffer());
+    _kernel.setArg<cl_int>(idx++, window_size);
+    _kernel.setArg<cl_int>(idx++, window_size_squared);
+    _kernel.setArg<cl_int>(idx++, window_size_half);
+    _kernel.setArg<cl_int>(idx++, num_iterations);
+    _kernel.setArg<cl_float>(idx++, epsilon);
+    _kernel.setArg<cl_float3>(idx++, border_limits);
+    _kernel.setArg<cl_float>(idx++, eig_const);
+    _kernel.setArg<cl_int>(idx++, level0);
+    _kernel.setArg<cl_int>(idx++, term_iteration);
+    _kernel.setArg<cl_int>(idx++, term_epsilon);
+}
+/** Bind the input tensor as the first kernel arguments and enqueue the kernel over @p window. */
+void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set static tensor arguments. Setting here as allocation might be deferred.
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _new_input, window);
+
+    enqueue(queue, *this, window);
+}
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..794a1bc56e
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+    // Validate: every tensor must be single-channel F32 with matching data types, and
+    // dimension 0 of input0 must equal dimension 1 of input1
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Local work-group size hint: (1, 7) when dimension 1 of the output is 196, (8, 8) otherwise
+    const bool output_dim1_is_196 = (output->info()->dimension(1) == 196);
+    _lws_hint                     = output_dim1_is_196 ? cl::NDRange(1, 7) : cl::NDRange(8, 8);
+
+    // The single build option passes dimension 0 of input0 to the kernel as WIDTH_VECTOR_A
+    std::ostringstream width_option;
+    width_option << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+
+    std::set<std::string> build_opts;
+    build_opts.emplace(width_option.str());
+
+    // Create kernel: its name is "gemm_lc_vm_" followed by the lower-cased data type name
+    const std::string kernel_name = "gemm_lc_vm_" + lower_string(string_from_data_type(input0->info()->data_type()));
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window kernel: the X step is max_cl_vector_width divided by the element size
+    const unsigned int step_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+    Window win = calculate_max_window(*output->info(), Steps(step_x));
+
+    // Each tensor is accessed as a rectangle of step_x elements by one row, starting at offset (0, 0)
+    AccessWindowRectangle input0_access(input0->info(), 0, 0, step_x, 1);
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, step_x, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, step_x, 1);
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    // The valid region of the output is declared as its whole tensor shape
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // input1 is bound through the first 3D slice of a window built from its own dimensions;
+    // that slice is not advanced while the 2D slices of the execution window are walked
+    Window input1_window;
+    input1_window.use_tensor_dimensions(_input1->info());
+    Window input1_slice = input1_window.first_slice_window_3D();
+    Window slice_2d     = window.first_slice_window_2D();
+    do
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input0, slice_2d);
+        add_3D_tensor_argument(arg_index, _input1, input1_slice);
+        add_2D_tensor_argument(arg_index, _output, slice_2d);
+        enqueue(queue, *this, slice_2d, _lws_hint);
+    }
+    while(window.slide_window_slice_2D(slice_2d));
+}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..c504189169
--- /dev/null
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
+    : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
+{
+}
+
+void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+                                       MagnitudeType mag_type, PhaseType phase_type)
+{
+    // Gradients must be single-channel S16 or S32 of the same type, and at least one output must be given
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+
+    // Each output is optional: a null pointer switches the corresponding computation off
+    _run_mag   = (magnitude != nullptr);
+    _run_phase = (phase != nullptr);
+
+    if(_run_mag)
+    {
+        // The magnitude must have the same data type as the gradients
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
+    }
+    if(_run_phase)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+    }
+    if(!(_run_mag || _run_phase))
+    {
+        ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+    }
+
+    _gx        = gx;
+    _gy        = gy;
+    _magnitude = magnitude;
+    _phase     = phase;
+
+    // Collect the build options of the kernel
+    std::set<std::string> build_opts;
+
+    // Add magnitude type (only when the magnitude is requested): L1NORM -> 1, L2NORM -> 2
+    if(_run_mag)
+    {
+        if(mag_type == MagnitudeType::L1NORM)
+        {
+            build_opts.insert("-DMAGNITUDE=1");
+        }
+        else if(mag_type == MagnitudeType::L2NORM)
+        {
+            build_opts.insert("-DMAGNITUDE=2");
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
+            build_opts.insert("-DMAGNITUDE=0");
+        }
+    }
+
+    // Add phase type (only when the phase is requested): UNSIGNED -> 1, SIGNED -> 2
+    if(_run_phase)
+    {
+        if(phase_type == PhaseType::UNSIGNED)
+        {
+            build_opts.insert("-DPHASE=1");
+        }
+        else if(phase_type == PhaseType::SIGNED)
+        {
+            build_opts.insert("-DPHASE=2");
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
+            build_opts.insert("-DPHASE=0");
+        }
+    }
+
+    // Add data_type (taken from gx)
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window                 win                 = calculate_max_window(*gx->info(), Steps(elems_per_iteration));
+
+    // A disabled output gets an access window built on a null tensor info
+    AccessWindowHorizontal gx_access(gx->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal gy_access(gy->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_magnitude_access(_run_mag ? magnitude->info() : nullptr, 0, elems_per_iteration);
+    AccessWindowHorizontal output_phase_access(_run_phase ? phase->info() : nullptr, 0, elems_per_iteration);
+    update_window_and_padding(win, gx_access, gy_access, output_magnitude_access, output_phase_access);
+
+    // Both outputs are valid on the intersection of the valid regions of the two gradients
+    ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), gy->info()->valid_region());
+    output_magnitude_access.set_valid_region(win, valid_region);
+    output_phase_access.set_valid_region(win, valid_region);
+
+    ICLKernel::configure(win);
+}
+
+void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Enqueue the kernel once per 2D slice of the execution window
+    Window slice_2d = window.first_slice_window_2D();
+    do
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _gx, slice_2d);
+        add_2D_tensor_argument(arg_index, _gy, slice_2d);
+        // Only the outputs requested in configure() are bound, in the order magnitude then phase
+        if(_run_mag)
+        {
+            add_2D_tensor_argument(arg_index, _magnitude, slice_2d);
+        }
+        if(_run_phase)
+        {
+            add_2D_tensor_argument(arg_index, _phase, slice_2d);
+        }
+
+        enqueue(queue, *this, slice_2d);
+    }
+    while(window.slide_window_slice_2D(slice_2d));
+}
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..b0b748f466
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMeanStdDevKernel::CLMeanStdDevKernel()
+    : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr)
+{
+}
+
+void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
+{
+    // input: 2D single-channel U8; mean and global_sum are mandatory; global_sum_squared only when stddev is requested
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(nullptr == mean);
+    ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+    ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+
+    _input              = input;
+    _mean               = mean;
+    _stddev             = stddev;
+    _global_sum         = global_sum;
+    _global_sum_squared = global_sum_squared;
+
+    // The standard deviation is optional: a null stddev pointer disables it
+    const bool compute_stddev = (_stddev != nullptr);
+
+    // Create kernel, adding -DSTDDEV only when the standard deviation is requested
+    std::set<std::string> build_opts;
+    if(compute_stddev)
+    {
+        build_opts.insert("-DSTDDEV");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", build_opts));
+
+    // Set fixed arguments; the leading slots are left for the input tensor, which is bound in run()
+    unsigned int arg_index = num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_index++, static_cast<cl_uint>(input->info()->dimension(1)));
+    _kernel.setArg(arg_index++, *_global_sum);
+    if(compute_stddev)
+    {
+        _kernel.setArg(arg_index++, *_global_sum_squared);
+    }
+
+    // Configure kernel window: each iteration covers 8 elements along X and dimension 1 of the input along Y
+    constexpr unsigned int step_x = 8;
+    const unsigned int     step_y = input->info()->dimension(1);
+
+    Window                win = calculate_max_window(*input->info(), Steps(step_x, step_y));
+    AccessWindowRectangle input_access(input->info(), 0, 0, step_x, step_y);
+    update_window_and_padding(win, input_access);
+
+    ICLKernel::configure(win);
+}
+
+void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Clear sums. The writes are non-blocking; 'zero' has static storage, so it outlives the transfers
+    static const cl_ulong zero = 0;
+    queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+    if(_stddev != nullptr)
+    {
+        queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+    }
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+        // Set slice step equal to height to force gws[1] to 1,
+        // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item
+        slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+
+    // Calculate mean and stddev in double precision: the 64-bit sums do not fit the 24-bit mantissa of a float
+    cl_ulong     global_sum         = 0;
+    cl_ulong     global_sum_squared = 0;
+    const double num_pixels         = static_cast<double>(_input->info()->dimension(0)) * static_cast<double>(_input->info()->dimension(1));
+
+    // Blocking read: it returns only once the accumulated sum is available on the host
+    queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
+    const double mean = global_sum / num_pixels;
+    *_mean            = static_cast<float>(mean);
+
+    if(_stddev != nullptr)
+    {
+        queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
+        // E[x^2] - E[x]^2 can round to a tiny negative value; clamp it so that sqrt never yields NaN
+        const double variance = (global_sum_squared / num_pixels) - (mean * mean);
+        *_stddev              = (variance > 0.0) ? static_cast<float>(std::sqrt(variance)) : 0.0f;
+    }
+}
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..95334c7b5f
--- /dev/null
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLMedian3x3Kernel::border_size() const
+{
+    return BorderSize(1); // Border used by configure() for the window, the input access offsets and the valid region
+}
+
+void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    // Both tensors must be single-channel U8
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel: the 3x3 box non-linear filter kernel built with -DMEDIAN
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" }));
+
+    // Configure kernel window: 8 elements are processed and written, 16 elements by 3 rows are read per iteration
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int elems_written   = 8;
+    constexpr unsigned int rows_read       = 3;
+    const BorderSize       border          = border_size();
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_written);
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..939a53b03a
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+CLMinMaxKernel::CLMinMaxKernel()
+    : _input(nullptr), _min_max(), _data_type_max_min()
+{
+}
+
+void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+
+    _input   = input;
+    _min_max = min_max;
+    // Reset values written by run(): [0] is the largest and [1] the smallest value of the input data type
+    const DataType data_type = input->info()->data_type();
+    if(data_type == DataType::U8)
+    {
+        _data_type_max_min[0] = UCHAR_MAX;
+        _data_type_max_min[1] = 0;
+    }
+    else if(data_type == DataType::S16)
+    {
+        _data_type_max_min[0] = SHRT_MAX;
+        _data_type_max_min[1] = SHRT_MIN;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("You called with the wrong image data types");
+    }
+
+    // Set kernel build options; one iteration spans dimension 0 of the input, hence the multiple-of-vector-width test
+    const unsigned int row_length = input->info()->dimension(0);
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
+    build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
+    build_opts.emplace((0 != (row_length % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+
+    // Create kernel and set its fixed arguments, placed after the slots left for the input tensor
+    _kernel          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts));
+    unsigned int idx = num_arguments_per_2D_tensor();
+    _kernel.setArg(idx++, *_min_max);
+    _kernel.setArg<cl_uint>(idx++, row_length);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(row_length));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, row_length));
+    ICLKernel::configure(win);
+}
+
+void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Reset minimum and maximum values with the pair prepared in configure() (non-blocking write)
+    const auto reset_bytes = _data_type_max_min.size() * sizeof(int);
+    queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, reset_bytes, _data_type_max_min.data());
+
+    Window slice_2d = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice_2d))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice_2d);
+        enqueue(queue, *this, slice_2d);
+    }
+}
+
+CLMinMaxLocationKernel::CLMinMaxLocationKernel()
+    : _input(nullptr), _min_max_count(nullptr)
+{
+}
+
+void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+    // min_max_count is dereferenced unconditionally below and in run(), so it is mandatory; min_loc and max_loc stay optional
+    ARM_COMPUTE_ERROR_ON(min_max_count == nullptr);
+
+    _input         = input;
+    _min_max_count = min_max_count;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
+    build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
+    build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
+    // Set static arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input tensor parameters, which are bound in run()
+    _kernel.setArg(idx++, *min_max);
+    _kernel.setArg(idx++, *min_max_count);
+    if(min_loc != nullptr)
+    {
+        _kernel.setArg(idx++, min_loc->cl_buffer());
+        _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
+    }
+    if(max_loc != nullptr)
+    {
+        _kernel.setArg(idx++, max_loc->cl_buffer());
+        _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
+    }
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    ICLKernel::configure(win);
+}
+
+void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Zero the first two unsigned int entries of the count buffer (non-blocking; the source has static storage)
+    static const unsigned int zero_count    = 0;
+    constexpr auto            counter_bytes = sizeof(zero_count);
+    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * counter_bytes, counter_bytes, &zero_count);
+    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * counter_bytes, counter_bytes, &zero_count);
+    Window slice_2d = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice_2d))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice_2d);
+        enqueue(queue, *this, slice_2d);
+    }
+}
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..6afa5822ba
--- /dev/null
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLNonLinearFilterKernel::CLNonLinearFilterKernel() // Starts with a border of 0; configure() replaces it with mask_size / 2
+    : _border_size(0)
+{
+}
+
+BorderSize CLNonLinearFilterKernel::border_size() const
+{
+    return _border_size; // Set by configure() from the mask size; 0 before configure() has been called
+}
+
+void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+                                        bool border_undefined)
+{
+    // Single-channel U8 tensors, a 3x3 or 5x5 mask and any pattern except OTHER; 'mask' itself is not used
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
+    ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
+    ARM_COMPUTE_UNUSED(mask);
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(mask_size / 2);
+
+    // Define build options: the filter function name becomes a preprocessor define
+    std::set<std::string> build_opts;
+    build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
+
+    // Create kernel named non_linear_filter_<lower-cased pattern><mask_size>x<mask_size>
+    std::string pattern_name = string_from_matrix_pattern(pattern);
+    for(char &c : pattern_name)
+    {
+        c = static_cast<char>(::tolower(c));
+    }
+    std::stringstream kernel_name;
+    kernel_name << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_opts));
+
+    // Configure kernel window: 8 elements are processed, 16 elements by mask_size rows are read per iteration
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    const unsigned int     rows_read       = mask_size;
+    const BorderSize       border          = border_size();
+
+    Window                 win = calculate_max_window(*input->info(), elems_processed, border_undefined, border);
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_processed);
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..6a96b0effd
--- /dev/null
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const // Border reported by this kernel; configure() negates its left/top fields to place the input read window.
+{
+    return BorderSize(1); // Fixed value, independent of configure() arguments (presumably one element per side for the 3x3 window - confirm against BorderSize).
+}
+
+void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    // Validates the tensors (single-channel U8 or F32), builds the non_max_suppression kernel and sets up its execution window.
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+    _input  = input;
+    _output = output;
+
+    // The only build option is DATA_TYPE, taken from the input tensor.
+    std::set<std::string> cl_defines;
+    cl_defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", cl_defines));
+
+    // Per-iteration footprint: 8 elements processed and written, 16 columns by 3 rows read.
+    constexpr unsigned int step_elems    = 8;
+    constexpr unsigned int written_elems = 8;
+    constexpr unsigned int read_elems    = 16;
+    constexpr unsigned int read_rows     = 3;
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(step_elems), border_undefined, border_size());
+
+    // The input read window is offset by the negated left/top border; the output is written as a horizontal run.
+    AccessWindowRectangle  src_access(input->info(), -border_size().left, -border_size().top, read_elems, read_rows);
+    AccessWindowHorizontal dst_access(output->info(), 0, written_elems);
+    update_window_and_padding(exec_window, src_access, dst_access);
+
+    dst_access.set_valid_region(exec_window, input->info()->valid_region(), border_undefined, border_size());
+    ICLKernel::configure(exec_window);
+}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..106a5113db
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerKernel::CLNormalizationLayerKernel() // Default state: no tensors bound and the border built from 0 until configure() runs.
+    : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize CLNormalizationLayerKernel::border_size() const // Reports the border stored by configure(); before that, the value set by the constructor.
+{
+    return _border_size;
+}
+
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+    // Validation: single-channel S16/U16/F16/F32 input and output of matching data type, odd normalization size, no IN_MAP_2D.
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+    _input         = input;
+    _squared_input = squared_input;
+    _output        = output;
+
+    // Half the normalization size is shared by the border, the last kernel argument and the read width below.
+    const unsigned int radius      = norm_info.norm_size() / 2;
+    const bool         in_map_1d   = (NormType::IN_MAP_1D == norm_info.type());
+    const unsigned int side_border = in_map_1d ? std::min(radius, 3U) : 0;
+    _border_size                   = BorderSize(0, side_border);
+
+    // Build the kernel that matches the normalization type; DATA_TYPE follows the input tensor.
+    const std::string     cl_kernel_name = in_map_1d ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
+    std::set<std::string> cl_defines;
+    cl_defines.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(cl_kernel_name, cl_defines));
+
+    // Static arguments are placed after the slots of the three 3D tensors that run() binds.
+    unsigned int arg_index = 3 * num_arguments_per_3D_tensor();
+    _kernel.setArg<cl_float>(arg_index++, norm_info.scale_coeff());
+    _kernel.setArg<cl_float>(arg_index++, norm_info.beta());
+    _kernel.setArg<cl_float>(arg_index++, norm_info.kappa());
+    _kernel.setArg<cl_uint>(arg_index++, radius);
+
+    // Window: 4 elements per step for IN_MAP_1D, 1 otherwise; each step reads step_elems + 2 * radius elements starting at -_border_size.left.
+    const unsigned int step_elems = in_map_1d ? 4 : 1;
+    const unsigned int read_elems = step_elems + 2 * radius;
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(step_elems));
+
+    AccessWindowHorizontal src_access(input->info(), -_border_size.left, read_elems);
+    AccessWindowHorizontal sq_src_access(squared_input->info(), -_border_size.left, read_elems);
+    AccessWindowHorizontal dst_access(output->info(), 0, step_elems);
+    update_window_and_padding(exec_window, src_access, sq_src_access, dst_access);
+
+    dst_access.set_valid_region(exec_window, input->info()->valid_region());
+
+    ICLKernel::configure(exec_window);
+}
+
+void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Enqueue the kernel once per 3D slice of the window; the first slice is always processed.
+    Window current = window.first_slice_window_3D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_3D(current))
+    {
+        // Tensor arguments are rebound for every slice, in the order input, squared input, output.
+        unsigned int arg_index = 0;
+        add_3D_tensor_argument(arg_index, _input, current);
+        add_3D_tensor_argument(arg_index, _squared_input, current);
+        add_3D_tensor_argument(arg_index, _output, current);
+        enqueue(queue, *this, current);
+    }
+}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..84eb434bc9
--- /dev/null
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel() // Default state: neither input nor the output is bound until configure() runs.
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+                                                ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+                             "Output can only be U8 if both inputs are U8");
+    ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    const DataType in1_type        = input1->info()->data_type();
+    const DataType in2_type        = input2->info()->data_type();
+    const DataType out_type        = output->info()->data_type();
+    const bool     any_float_input = is_data_type_float(in1_type) || is_data_type_float(in2_type);
+
+    // Integer scaling is used when scale == 1/2^n with 0 <= n <= 15 and neither input is floating point.
+    // std::frexp() reports such a scale as mantissa 0.5 with exponent e = 1 - n, i.e. -14 <= e <= 1,
+    // so n is recovered as |e - 1|. A shift amount of -1 selects the float-scale kernel instead.
+    int         shift_amount   = -1;
+    int         frexp_exponent = 0;
+    const float frexp_mantissa = std::frexp(scale, &frexp_exponent);
+    const bool  is_pow2_scale  = (frexp_mantissa == 0.5f) && (frexp_exponent >= -14) && (frexp_exponent <= 1);
+    if(is_pow2_scale && !any_float_input)
+    {
+        shift_amount = std::abs(frexp_exponent - 1);
+    }
+
+    // DATA_TYPE_RES and the DATA_TYPE_FLOAT / DATA_TYPE_INT define are picked from the input data types.
+    std::string res_type;
+    std::string type_define;
+    if(any_float_input)
+    {
+        const bool any_f32 = (DataType::F32 == in1_type) || (DataType::F32 == in2_type);
+        res_type           = any_f32 ? "float" : "half";
+        type_define        = "DATA_TYPE_FLOAT";
+    }
+    else
+    {
+        const bool any_s16 = (DataType::S16 == in1_type) || (DataType::S16 == in2_type);
+        res_type           = any_s16 ? "int" : "ushort";
+        type_define        = "DATA_TYPE_INT";
+    }
+
+    // The kernel name suffix follows the scaling mode chosen above.
+    const std::string cl_kernel_name = std::string("pixelwise_mul") + ((shift_amount >= 0) ? "_int" : "_float");
+
+    // Build options: WRAP is also forced whenever the output is floating point.
+    const bool            use_wrap = (ConvertPolicy::WRAP == overflow_policy) || is_data_type_float(out_type);
+    std::set<std::string> cl_defines;
+    cl_defines.emplace(use_wrap ? "-DWRAP" : "-DSATURATE");
+    cl_defines.emplace((RoundingPolicy::TO_ZERO == rounding_policy) ? "-DROUND=_rtz" : "-DROUND=_rte");
+    cl_defines.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(in1_type));
+    cl_defines.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(in2_type));
+    cl_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+    cl_defines.emplace("-DDATA_TYPE_RES=" + res_type);
+    cl_defines.emplace("-D" + type_define);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(cl_kernel_name, cl_defines));
+
+    // The scale follows the three 2D tensors bound in run(); it is passed as int (shift) or float (factor) depending on the mode.
+    unsigned int arg_index = 3 * num_arguments_per_2D_tensor();
+    if(shift_amount >= 0)
+    {
+        _kernel.setArg(arg_index++, shift_amount);
+    }
+    else
+    {
+        _kernel.setArg(arg_index++, scale);
+    }
+
+    // Execution window: 16 elements per step, sized from the first input.
+    constexpr unsigned int step_elems  = 16;
+    Window                 exec_window = calculate_max_window(*input1->info(), Steps(step_elems));
+
+    AccessWindowHorizontal in1_access(input1->info(), 0, step_elems);
+    AccessWindowHorizontal in2_access(input2->info(), 0, step_elems);
+    AccessWindowHorizontal out_access(output->info(), 0, step_elems);
+    update_window_and_padding(exec_window, in1_access, in2_access, out_access);
+
+    // The output valid region is derived from the intersection of both inputs' valid regions.
+    const ValidRegion common_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+    out_access.set_valid_region(exec_window, common_region);
+
+    ICLKernel::configure(exec_window);
+}
+
+void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Enqueue the kernel once per 2D slice of the window; the first slice is always processed.
+    Window current = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(current))
+    {
+        // Tensor arguments are rebound for every slice, in the order first input, second input, output.
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input1, current);
+        add_2D_tensor_argument(arg_index, _input2, current);
+        add_2D_tensor_argument(arg_index, _output, current);
+        enqueue(queue, *this, current);
+    }
+}
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..dc5ae4ec7a
--- /dev/null
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLPoolingLayerKernel::CLPoolingLayerKernel() // Default state: no tensors bound, default-constructed pooling info, border built from 0.
+    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+{
+}
+
+BorderSize CLPoolingLayerKernel::border_size() const // Reports the border computed by configure() from the padding and the pooling overshoot.
+{
+    return _border_size;
+}
+
+void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{ // Validates the arguments, derives the input border, builds the pooling kernel and sets up the execution window.
+    int                   pool_pad_x      = 0;
+    int                   pool_pad_y      = 0;
+    int                   pool_stride_x   = 0;
+    int                   pool_stride_y   = 0;
+    unsigned int          pooled_w        = 0;
+    unsigned int          pooled_h        = 0;
+    const PoolingType     pool_type       = pool_info.pool_type();
+    const int             pool_size       = pool_info.pool_size();
+    const PadStrideInfo   pad_stride_info = pool_info.pad_stride_info();
+    DimensionRoundingType pool_round      = pad_stride_info.round();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad(); // unpacked into signed ints so they can be negated and subtracted below
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size); // only pool sizes 2 and 3 are accepted
+    ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); // padding must be smaller than the pool size
+
+    // Check output dimensions: dimensions 0 and 1 of the output must equal the size returned by scaled_dimensions()
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+                                                     input->info()->dimension(1),
+                                                     pool_size,
+                                                     pool_stride_x, pool_stride_y,
+                                                     pool_pad_x, pool_pad_y,
+                                                     pool_round);
+    ARM_COMPUTE_UNUSED(pooled_w); // NOTE(review): both values are used below; presumably needed when the assertion compiles to nothing - confirm
+    ARM_COMPUTE_UNUSED(pooled_h);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+    const int input_width   = input->info()->dimension(0);
+    const int input_height  = input->info()->dimension(1);
+    const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; // end of the last pooling window in x minus the input width; pooled_w is unsigned, so this is evaluated unsigned and then converted to int
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; // same for y against the input height
+
+    // Set instance variables; the right/bottom borders become the larger of that overshoot and the padding
+    _input              = input;
+    _output             = output;
+    _pool_info          = pool_info;
+    _border_size        = BorderSize(pool_pad_y, pool_pad_x);
+    _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+    _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+    // Set build options: DATA_TYPE from the input tensor, POOL_MAX for max pooling and POOL_AVG for every other pooling type
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+
+    // Create kernel: the name is pooling_layer_ followed by the pool size
+    std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Set static kernel arguments (average pooling only)
+    if(pool_type == PoolingType::AVG)
+    {
+        // Create static kernel arguments: input size plus padding, strides and paddings, each as an (x, y) pair
+        const cl_int2 max_dims =
+        {
+            {
+                static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
+                static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
+            }
+        };
+        const cl_int2 strides =
+        {
+            {
+                pool_stride_x,
+                pool_stride_y,
+            }
+        };
+        const cl_int2 paddings =
+        {
+            {
+                pool_pad_x,
+                pool_pad_y,
+            }
+        };
+
+        // Set static kernel arguments after the slots of the two 3D tensors bound in run()
+        unsigned int idx = 2 * num_arguments_per_3D_tensor();
+        _kernel.setArg<cl_int2>(idx++, max_dims);
+        _kernel.setArg<cl_int2>(idx++, strides);
+        _kernel.setArg<cl_int2>(idx++, paddings);
+    }
+
+    // Configure kernel window over the output, one element per iteration
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); // input region from (-pad_x, -pad_y) to (width + right border, height + bottom border)
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); // the valid region is built from the full output shape
+
+    ICLKernel::configure(win);
+}
+
+void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the pooling kernel once per 3D slice of the given window.
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    int pool_pad_x = 0, pool_pad_y = 0, pool_stride_x = 0, pool_stride_y = 0; // signed and all initialised, as in configure(): the starts below go negative without unsigned wrap-around
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        // Map the output slice onto the input: start shifted back by the padding, end and step scaled by the stride
+        Window in_slice(slice);
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+
+        // Bind the input with the remapped slice and the output with the original slice, then enqueue over the output slice
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
new file mode 100644
index 0000000000..e63a5ef7c6
--- /dev/null
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLRemapKernel::CLRemapKernel() // Default state: input, output and both map tensors are unbound until configure() runs.
+    : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+BorderSize CLRemapKernel::border_size() const // Border reported by this kernel; configure() uses its left field as the input border offset when the border is defined.
+{
+    return BorderSize(1); // Fixed value, independent of configure() arguments.
+}
+
+void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{ // Binds the tensors, builds the remap kernel for the policy and declares the regions of input and output the kernel accesses.
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
+
+    _input  = input;
+    _output = output;
+    _map_x  = map_x;
+    _map_y  = map_y;
+
+    // Create kernel: the name is remap_ followed by the lower-cased interpolation policy name
+    std::set<std::string> build_opts         = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+    std::string           interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+    std::string kernel_name = "remap_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window: iterate over the output, 4 elements per step
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    const int              border_offset                     = (border_undefined) ? 0 : border_size().left; // no extra input border is requested when the border is undefined
+    // The static read region is declared on the INPUT (sized from the input, widened by the border); the horizontal write window is declared on the OUTPUT
+    Window             win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
+                                    input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+
+    // Set static arguments: input width and height as floats, placed after the four 2D tensors bound in run()
+    unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
+}
+
+void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Enqueue the kernel once per 2D slice of the window; the first slice is always processed.
+    Window current = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(current))
+    {
+        // Tensor arguments are rebound for every slice, in the order input, output, x map, y map.
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, current);
+        add_2D_tensor_argument(arg_index, _output, current);
+        add_2D_tensor_argument(arg_index, _map_x, current);
+        add_2D_tensor_argument(arg_index, _map_y, current);
+        enqueue(queue, *this, current);
+    }
+}
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
new file mode 100644
index 0000000000..d74e837ace
--- /dev/null
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLScaleKernel::border_size() const // Border reported by this kernel; configure() uses its left field as the input border offset when the border is defined.
+{
+    return BorderSize(1); // Fixed value, independent of configure() arguments.
+}
+
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // Extents of source and destination along dimensions 0 and 1.
+    const auto src_width  = input->info()->dimension(0);
+    const auto src_height = input->info()->dimension(1);
+    const auto dst_width  = output->info()->dimension(0);
+    const auto dst_height = output->info()->dimension(1);
+
+    // AREA becomes NEAREST_NEIGHBOR when both source/destination ratios are <= 1 (up-sampling); any other AREA request trips the assertion.
+    const float ratio_w         = static_cast<float>(src_width) / static_cast<float>(dst_width);
+    const float ratio_h         = static_cast<float>(src_height) / static_cast<float>(dst_height);
+    const bool  area_as_nearest = (InterpolationPolicy::AREA == policy) && (ratio_w <= 1.f) && (ratio_h <= 1.f);
+    if(!area_as_nearest)
+    {
+        ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+    }
+    else
+    {
+        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+    }
+
+    // Create kernel: the name is scale_ followed by the lower-cased policy name; DATA_TYPE follows the input tensor.
+    std::string cl_kernel_name = string_from_interpolation_policy(policy);
+    std::transform(cl_kernel_name.begin(), cl_kernel_name.end(), cl_kernel_name.begin(), ::tolower);
+    cl_kernel_name.insert(0, "scale_");
+    std::set<std::string> cl_defines;
+    cl_defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(cl_kernel_name, cl_defines));
+
+    // Window over the output, 4 elements per step; the input region is widened by border_offset on every side.
+    constexpr unsigned int step_elems    = 4;
+    const int              border_offset = (border_undefined) ? 0 : border_size().left;
+    Window                 exec_window   = calculate_max_window(*output->info(), Steps(step_elems));
+    AccessWindowStatic     src_access(input->info(), -border_offset, -border_offset, src_width + border_offset, src_height + border_offset);
+    AccessWindowHorizontal dst_access(output->info(), 0, step_elems);
+    update_window_and_padding(exec_window, src_access, dst_access);
+    dst_access.set_valid_region(exec_window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(exec_window);
+
+    // Static arguments follow the two 2D tensors: input width and height, then output width and height, all as floats.
+    unsigned int arg_index = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<float>(arg_index++, src_width);
+    _kernel.setArg<float>(arg_index++, src_height);
+    _kernel.setArg<float>(arg_index++, dst_width);
+    _kernel.setArg<float>(arg_index++, dst_height);
+}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..913ef592d4
--- /dev/null
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor.
+ *
+ * Clears both gradient flags and nulls every tensor pointer; configure() fills them in.
+ */
+CLScharr3x3Kernel::CLScharr3x3Kernel()
+    : _run_scharr_x(false),
+      _run_scharr_y(false),
+      _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr)
+{
+}
+
+/** Border used by this kernel.
+ *
+ * configure() forwards this value to the window computation and uses its left/top
+ * components to offset the input access window.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLScharr3x3Kernel::border_size() const
+{
+    const BorderSize border(1);
+    return border;
+}
+
+/** Validate the tensors, build the "scharr3x3" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         Destination of the X gradient, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[out] output_y         Destination of the Y gradient, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[in]  border_undefined Forwarded, together with border_size(), to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is produced only when its destination tensor was supplied
+    _run_scharr_x = output_x != nullptr;
+    _run_scharr_y = output_y != nullptr;
+
+    // Validate each requested output and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_scharr_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_scharr_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;  // elements processed per iteration
+    constexpr unsigned int elems_read     = 16; // elements read per iteration
+    constexpr unsigned int elems_written  = 8;  // elements written per iteration
+    constexpr unsigned int rows_read      = 3;  // rows read per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Outputs that were not requested get a null info in their access window
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_access, output_x_access, output_y_access);
+
+    // Read the input valid region only after the window/padding update, as the original code did
+    const auto &input_valid_region = input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: input, then X output (if enabled), then Y output (if enabled)
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+
+        if(_run_scharr_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_scharr_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
new file mode 100644
index 0000000000..436aaa498a
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor.
+ *
+ * Nulls every tensor pointer and clears both gradient flags; configure() fills them in.
+ */
+CLSobel3x3Kernel::CLSobel3x3Kernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+}
+
+/** Border used by this kernel.
+ *
+ * configure() forwards this value to the window computation and uses its left/top
+ * components to offset the input access window.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLSobel3x3Kernel::border_size() const
+{
+    const BorderSize border(1);
+    return border;
+}
+
+/** Validate the tensors, build the "sobel3x3" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         Destination of the X gradient, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[out] output_y         Destination of the Y gradient, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[in]  border_undefined Forwarded, together with border_size(), to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is produced only when its destination tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    // Validate each requested output and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;  // elements processed per iteration
+    constexpr unsigned int elems_read     = 16; // elements read per iteration
+    constexpr unsigned int elems_written  = 8;  // elements written per iteration
+    constexpr unsigned int rows_read      = 3;  // rows read per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Outputs that were not requested get a null info in their access window
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_access, output_x_access, output_y_access);
+
+    // Read the input valid region only after the window/padding update, as the original code did
+    const auto &input_valid_region = input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: input, then X output (if enabled), then Y output (if enabled)
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
new file mode 100644
index 0000000000..4c0316f19e
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor.
+ *
+ * Nulls every tensor pointer, clears both gradient flags and initialises the border from 0;
+ * configure() fills them in.
+ */
+CLSobel5x5HorKernel::CLSobel5x5HorKernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false),
+      _border_size(0)
+{
+}
+
+/** Border currently associated with this kernel.
+ *
+ * @return A copy of the stored border: the value set by configure(), or the one built from 0
+ *         by the constructor if configure() has not run yet.
+ */
+BorderSize CLSobel5x5HorKernel::border_size() const
+{
+    return this->_border_size;
+}
+
+/** Validate the tensors, build the "sobel_separable1x5" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         Destination of the X result, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[out] output_y         Destination of the Y result, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[in]  border_undefined Selects the first border component (0 when true, 2 otherwise) and is
+ *                              forwarded to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A result is produced only when its destination tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    // Validate each requested output and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Must be stored before the window is computed: border_size() returns this member.
+    // NOTE(review): the two arguments are presumably (top/bottom, left/right) - confirm against BorderSize.
+    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;  // elements processed per iteration
+    constexpr unsigned int elems_read     = 16; // elements read per iteration
+    constexpr unsigned int elems_written  = 8;  // elements written per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window_horizontal(*input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Outputs that were not requested get a null info in their access window
+    AccessWindowHorizontal input_access(input->info(), -border.left, elems_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_access, output_x_access, output_y_access);
+
+    // Read the input valid region only after the window/padding update, as the original code did
+    const auto &input_valid_region = input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: input, then X output (if enabled), then Y output (if enabled)
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
+
+/** Default constructor.
+ *
+ * Nulls every input/output tensor pointer and clears both gradient flags; configure() fills them in.
+ */
+CLSobel5x5VertKernel::CLSobel5x5VertKernel()
+    : _input_x(nullptr),
+      _input_y(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+}
+
+/** Border used by this kernel.
+ *
+ * configure() forwards this value to the window computation and uses its top
+ * component to offset the input access windows.
+ *
+ * @return A BorderSize constructed from the pair (2, 0).
+ */
+BorderSize CLSobel5x5VertKernel::border_size() const
+{
+    const BorderSize border(2, 0);
+    return border;
+}
+
+/** Validate the tensors, build the "sobel_separable5x1" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input_x          Source for the X result. Checked to be single channel S16 when @p output_x is given.
+ * @param[in]  input_y          Source for the Y result. Checked to be single channel S16 when @p output_y is given.
+ * @param[out] output_x         Destination of the X result, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[out] output_y         Destination of the Y result, or nullptr to skip it. Checked to be single channel S16 when given.
+ * @param[in]  border_undefined Forwarded, together with border_size(), to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A result is produced only when its destination tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    // Validate each requested input/output pair and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input_x  = input_x;
+    _input_y  = input_y;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts));
+
+    // The window and valid region are derived from the X input when X is enabled, otherwise from the Y input
+    const ICLTensor *reference_input = _run_sobel_x ? _input_x : _input_y;
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8; // elements processed per iteration
+    constexpr unsigned int elems_written  = 8; // elements written per iteration
+    constexpr unsigned int elems_read     = 8; // elements read per iteration
+    constexpr unsigned int rows_read      = 5; // rows read per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window(*reference_input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Tensors that were not supplied get a null info in their access window
+    AccessWindowRectangle  input_x_access(input_x != nullptr ? input_x->info() : nullptr, 0, -border.top, elems_read, rows_read);
+    AccessWindowRectangle  input_y_access(input_y != nullptr ? input_y->info() : nullptr, 0, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_x_access, input_y_access, output_x_access, output_y_access);
+
+    // Read the reference valid region only after the window/padding update, as the original code did
+    const auto &reference_valid_region = reference_input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, reference_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, reference_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: X input/output pair (if enabled), Y input/output pair (if enabled), trailing dummy
+        unsigned int arg_idx = 0;
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _input_x, slice_2d);
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _input_y, slice_2d);
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        // One extra integer argument set to 0 follows the tensors
+        _kernel.setArg(arg_idx++, 0 /*dummy*/);
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
new file mode 100644
index 0000000000..a477953cfb
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor.
+ *
+ * Nulls every tensor pointer, clears both gradient flags and initialises the border from 0;
+ * configure() fills them in.
+ */
+CLSobel7x7HorKernel::CLSobel7x7HorKernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false),
+      _border_size(0)
+{
+}
+
+/** Border currently associated with this kernel.
+ *
+ * @return A copy of the stored border: the value set by configure(), or the one built from 0
+ *         by the constructor if configure() has not run yet.
+ */
+BorderSize CLSobel7x7HorKernel::border_size() const
+{
+    return this->_border_size;
+}
+
+/** Validate the tensors, build the "sobel_separable1x7" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         Destination of the X result, or nullptr to skip it. Checked to be single channel S32 when given.
+ * @param[out] output_y         Destination of the Y result, or nullptr to skip it. Checked to be single channel S32 when given.
+ * @param[in]  border_undefined Selects the first border component (0 when true, 3 otherwise) and is
+ *                              forwarded to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A result is produced only when its destination tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    // Validate each requested output and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Must be stored before the window is computed: border_size() returns this member.
+    // NOTE(review): the two arguments are presumably (top/bottom, left/right) - confirm against BorderSize.
+    _border_size = BorderSize(border_undefined ? 0 : 3, 3);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x7", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;  // elements processed per iteration
+    constexpr unsigned int elems_read     = 16; // elements read per iteration
+    constexpr unsigned int elems_written  = 8;  // elements written per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window_horizontal(*input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Outputs that were not requested get a null info in their access window
+    AccessWindowHorizontal input_access(input->info(), -border.left, elems_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_access, output_x_access, output_y_access);
+
+    // Read the input valid region only after the window/padding update, as the original code did
+    const auto &input_valid_region = input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, input_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: input, then X output (if enabled), then Y output (if enabled)
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
+
+/** Default constructor.
+ *
+ * Nulls every input/output tensor pointer and clears both gradient flags; configure() fills them in.
+ */
+CLSobel7x7VertKernel::CLSobel7x7VertKernel()
+    : _input_x(nullptr),
+      _input_y(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+}
+
+/** Border used by this kernel.
+ *
+ * configure() forwards this value to the window computation and uses its top
+ * component to offset the input access windows.
+ *
+ * @return A BorderSize constructed from the pair (3, 0).
+ */
+BorderSize CLSobel7x7VertKernel::border_size() const
+{
+    const BorderSize border(3, 0);
+    return border;
+}
+
+/** Validate the tensors, build the "sobel_separable7x1" OpenCL kernel and compute the execution window.
+ *
+ * @param[in]  input_x          Source for the X result. Checked to be single channel S32 when @p output_x is given.
+ * @param[in]  input_y          Source for the Y result. Checked to be single channel S32 when @p output_y is given.
+ * @param[out] output_x         Destination of the X result, or nullptr to skip it. Checked to be single channel S32 when given.
+ * @param[out] output_y         Destination of the Y result, or nullptr to skip it. Checked to be single channel S32 when given.
+ * @param[in]  border_undefined Forwarded, together with border_size(), to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A result is produced only when its destination tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    // Validate each requested input/output pair and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input_x  = input_x;
+    _input_y  = input_y;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts));
+
+    // The window and valid region are derived from the X input when X is enabled, otherwise from the Y input
+    const ICLTensor *reference_input = _run_sobel_x ? _input_x : _input_y;
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8; // elements processed per iteration
+    constexpr unsigned int elems_written  = 8; // elements written per iteration
+    constexpr unsigned int elems_read     = 8; // elements read per iteration
+    constexpr unsigned int rows_read      = 7; // rows read per iteration
+    const BorderSize       border         = border_size();
+
+    Window exec_window = calculate_max_window(*reference_input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // Tensors that were not supplied get a null info in their access window
+    AccessWindowRectangle  input_x_access(input_x != nullptr ? input_x->info() : nullptr, 0, -border.top, elems_read, rows_read);
+    AccessWindowRectangle  input_y_access(input_y != nullptr ? input_y->info() : nullptr, 0, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x != nullptr ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y != nullptr ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(exec_window, input_x_access, input_y_access, output_x_access, output_y_access);
+
+    // Read the reference valid region only after the window/padding update, as the original code did
+    const auto &reference_valid_region = reference_input->info()->valid_region();
+    output_x_access.set_valid_region(exec_window, reference_valid_region, border_undefined, border);
+    output_y_access.set_valid_region(exec_window, reference_valid_region, border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel once for every 2D slice of @p window.
+ *
+ * @param[in]     window Region to execute. Checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the window cannot slide any further
+    for(;;)
+    {
+        // Argument order: X input/output pair (if enabled), Y input/output pair (if enabled), trailing dummy
+        unsigned int arg_idx = 0;
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _input_x, slice_2d);
+            add_2D_tensor_argument(arg_idx, _output_x, slice_2d);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _input_y, slice_2d);
+            add_2D_tensor_argument(arg_idx, _output_y, slice_2d);
+        }
+
+        // One extra integer argument set to 0 follows the tensors
+        _kernel.setArg(arg_idx++, 0 /*dummy*/);
+
+        enqueue(queue, *this, slice_2d);
+
+        if(!window.slide_window_slice_2D(slice_2d))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..0470d5243e
--- /dev/null
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // The kernel walks each row in steps of 16, so the X step of the window is the row length rounded up to 16
+    const auto         row_length = input->info()->dimension(0);
+    const unsigned int row_step   = ceil_to_multiple(row_length, 16);
+
+    // Build options: data-type selector, plus a flag when the row length is not a multiple of the vector width
+    std::set<std::string> defines{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    if((row_length % max_cl_vector_width) != 0)
+    {
+        defines.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", defines));
+
+    // The row length is the first argument after the input and output tensor parameters
+    const unsigned int width_arg_index = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(width_arg_index, row_length);
+
+    // Configure kernel window: a whole (rounded-up) row is read and one element is written per iteration
+    constexpr unsigned int num_elems_written_per_iteration = 1;
+
+    Window                 win = calculate_max_window(*input->info(), Steps(row_step));
+    AccessWindowHorizontal input_access(input->info(), 0, row_step);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
+    : _input{ nullptr }, _max{ nullptr }, _output{ nullptr }, _sum{ nullptr } // all tensor pointers start out null
+{
+}
+
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    // The kernel walks each row in steps of 16, so the X step of the window is the row length rounded up to 16
+    const auto         row_length = input->info()->dimension(0);
+    const unsigned int row_step   = ceil_to_multiple(row_length, 16);
+
+    // Build options: data-type selector, plus a flag when the row length is not a multiple of the vector width
+    std::set<std::string> defines{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    if((row_length % max_cl_vector_width) != 0)
+    {
+        defines.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", defines));
+
+    // The row length is the first argument after the four tensor parameters (input, max, output, sum)
+    const unsigned int width_arg_index = 4 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(width_arg_index, row_length);
+
+    // Configure window: whole rows of input/output are accessed, max and sum one element at a time
+    Window win = calculate_max_window(*input->info(), Steps(row_step));
+
+    AccessWindowHorizontal input_access(input->info(), 0, row_step);
+    AccessWindowHorizontal max_access(max->info(), 0, 1);
+    AccessWindowHorizontal output_access(output->info(), 0, row_step);
+    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // Enqueue the kernel once for every 2D slice of the requested window
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        // Tensor arguments are bound in the order input, max, output, sum
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, plane);
+        add_2D_tensor_argument(arg_index, _max, plane);
+        add_2D_tensor_argument(arg_index, _output, plane);
+        add_2D_tensor_argument(arg_index, _sum, plane);
+        enqueue(queue, *this, plane);
+    }
+}
+
+CLLogits1DNormKernel::CLLogits1DNormKernel()
+    : _input{ nullptr }, _sum{ nullptr }, _output{ nullptr } // all tensor pointers start out null
+{
+}
+
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+
+    _input  = input;
+    _sum    = sum;
+    _output = output;
+
+    // The only build option selects the data type of the tensors
+    std::set<std::string> defines{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", defines));
+
+    // Configure window: input and output advance 16 elements per iteration
+    constexpr unsigned int elems_per_step = 16;
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_step));
+
+    AccessWindowHorizontal input_access(input->info(), 0, elems_per_step);
+    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_step);
+
+    update_window_and_padding(win, input_access, sum_access, output_access);
+
+    // The output inherits the valid region of the input
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    for(bool has_plane = true; has_plane; has_plane = window.slide_window_slice_2D(plane))
+    {
+        // The sum tensor is bound through a copy of the slice whose X dimension is overridden with Dimension(0, 1, 1)
+        Window sum_plane = plane;
+        sum_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        // Bind input, sum and output in that order, then enqueue this slice
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, plane);
+        add_2D_tensor_argument(arg_index, _sum, sum_plane);
+        add_2D_tensor_argument(arg_index, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
new file mode 100644
index 0000000000..bbdaa37410
--- /dev/null
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+#include <string>
+
+using namespace arm_compute;
+
+void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(lut == nullptr);
+    ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    // Create kernel: the variant is picked from the LUT element type
+    const bool        is_s16_lut  = (DataType::S16 == lut->type());
+    const std::string kernel_name = is_s16_lut ? "tablelookup_S16" : "tablelookup_U8";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // The LUT buffer follows the input and output tensor parameters; the S16 variant also takes offset and size
+    unsigned int arg_index = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_index++, lut->cl_buffer());
+    if(is_s16_lut)
+    {
+        _kernel.setArg(arg_index++, lut->index_offset());
+        _kernel.setArg(arg_index++, static_cast<uint32_t>(lut->num_elements()));
+    }
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8; // step handed to the base-class configure
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
new file mode 100644
index 0000000000..6e07cefc77
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+                                  uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // The kernel name is "threshold" plus a suffix selected by the thresholding type
+    std::string kernel_name = "threshold";
+
+    if(ThresholdType::BINARY == type)
+    {
+        kernel_name += "_binary";
+    }
+    else if(ThresholdType::RANGE == type)
+    {
+        kernel_name += "_range";
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Thresholding type not recognized");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Scalar arguments follow the input and output tensor parameters: false value, true value, threshold
+    unsigned int arg_index = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_index++, false_value);
+    _kernel.setArg(arg_index++, true_value);
+    _kernel.setArg(arg_index++, threshold);
+
+    if(ThresholdType::RANGE == type)
+    {
+        _kernel.setArg(arg_index, upper); // only the range variant receives the upper bound
+    }
+
+    // Make sure _kernel is initialized before calling the parent's configure
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
new file mode 100644
index 0000000000..2ee6fcb9dc
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // The expected output shape is the input shape with its first two dimensions swapped
+    TensorShape transposed_shape{ input->info()->tensor_shape() };
+    transposed_shape.set(0, input->info()->dimension(1));
+    transposed_shape.set(1, input->info()->dimension(0));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), transposed_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), transposed_shape);
+
+    _input    = input;
+    _output   = output;
+    _lws_hint = cl::NDRange(2, 8);
+
+    // The element size in bytes is handed to the kernel as a compile-time define
+    const auto         element_size = input->info()->element_size();
+    std::ostringstream element_size_str;
+    element_size_str << element_size;
+    std::set<std::string> defines{ "-DDATA_TYPE_IN_BYTES=" + element_size_str.str() };
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", defines));
+
+    // Configure kernel window: square blocks whose side is the vector width divided by the element size
+    const unsigned int block_size = max_cl_vector_width / element_size;
+
+    Window win = calculate_max_window(*input->info(), Steps(block_size, block_size));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, block_size, block_size);
+    AccessWindowTranspose output_access(output->info(), 0, 0, block_size, block_size);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
new file mode 100644
index 0000000000..e549dbc258
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i) // one "-DMAT<i>=<value> " build option per coefficient
+    {
+        std::stringstream mat_str; // operator<< returns std::ostream &, hence the cast back before str() below
+        mat_str.precision(9);      // 9 significant digits round-trip a float; the default of 6 truncated coefficients
+        options.insert(static_cast<std::stringstream &>(mat_str << "-DMAT" << i << "=" << matrix[i] << " ").str());
+    }
+}
+} // namespace
+
+BorderSize CLWarpAffineKernel::border_size() const
+{
+    return BorderSize(1); // constant border built from the single value 1 (NOTE(review): presumably 1 element on every side - confirm)
+}
+
+void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+    ARM_COMPUTE_ERROR_ON(matrix == nullptr); // the 6 coefficients are read unconditionally below
+
+    _input  = input;
+    _output = output;
+
+    // Create build options: the 6 matrix coefficients plus the OpenCL data type of the input
+    std::set<std::string> options;
+    options_add_matrix(options, matrix, 6);
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    // Create kernel: the name is "warp_affine_" followed by the lower-cased interpolation policy name
+    std::string interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); // NOTE(review): <algorithm>/<cctype> are not included directly by this file
+    std::string kernel_name = "warp_affine_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Set static kernel arguments: input dimensions 0 and 1 follow the input and output tensor parameters
+    unsigned int idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+    // Configure kernel window: the window is derived from the output tensor and advances 4 elements per iteration
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    Window                 win                               = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
new file mode 100644
index 0000000000..fddb580750
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i) // one "-DMAT<i>=<value> " build option per coefficient
+    {
+        std::stringstream mat_str; // operator<< returns std::ostream &, hence the cast back before str() below
+        mat_str.precision(9);      // 9 significant digits round-trip a float; the default of 6 truncated coefficients
+        options.insert(static_cast<std::stringstream &>(mat_str << "-DMAT" << i << "=" << matrix[i] << " ").str());
+    }
+}
+} // namespace
+
+BorderSize CLWarpPerspectiveKernel::border_size() const
+{
+    return BorderSize(1); // constant border built from the single value 1 (NOTE(review): presumably 1 element on every side - confirm)
+}
+
+void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+    ARM_COMPUTE_ERROR_ON(matrix == nullptr); // the 9 coefficients are read unconditionally below
+
+    _input  = input;
+    _output = output;
+
+    // Create build options: the 9 matrix coefficients plus the OpenCL data type of the input
+    std::set<std::string> options;
+    options_add_matrix(options, matrix, 9);
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    // Create kernel: the name is "warp_perspective_" followed by the lower-cased interpolation policy name
+    std::string interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); // NOTE(review): <algorithm>/<cctype> are not included directly by this file
+    std::string kernel_name = "warp_perspective_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Set static kernel arguments: input dimensions 0 and 1 follow the input and output tensor parameters
+    unsigned int idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+    // Configure kernel window: the window is derived from the output tensor and advances 4 elements per iteration
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    Window                 win                               = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..018f272921
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+    : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr) // tensor pointers are assigned in configure()
+{
+}
+
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    // The kernel is built with the input's DATA_TYPE only, so the output must use the same data type
+    ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->info()->data_type());
+
+    // Shared weights (is_shared == true) may have one more dimension, which must match on input and output
+    ARM_COMPUTE_ERROR_ON(_is_shared && (input->info()->dimension(4) != output->info()->dimension(2)));
+    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > (_is_shared ? 5U : 4U));
+    ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > (_is_shared ? 3U : 2U));
+
+    // Biases are optional; when given they must also match the data type the kernel is built for
+    const bool has_bias = (biases != nullptr);
+    if(has_bias)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_ERROR_ON(input->info()->data_type() != biases->info()->data_type());
+    }
+
+    _input  = input;
+    _biases = biases;
+    _output = output;
+
+    // Create build options; HAS_BIAS is only defined when a bias tensor is present (no empty option is added)
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    if(has_bias)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+    // Set static arguments: the four input dimensions come after the input (3D), output (2D) and optional bias (1D) arguments
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor() + (has_bias ? num_arguments_per_1D_tensor() : 0);
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(3));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(false) // is_shared = false: base configure() then applies the 4D-input / 2D-output limits
+{
+}
+
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // Only the first 3D slice of the execution window and the first 2D slice of the whole output are used
+    Window output_window;
+    output_window.use_tensor_dimensions(_output->info());
+    const Window weights_slice  = window.first_slice_window_3D();
+    const Window reshaped_slice = output_window.first_slice_window_2D();
+
+    // Bind the tensors in the order input, output and (when present) biases
+    unsigned arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, weights_slice);
+    add_2D_tensor_argument(arg_idx, _output, reshaped_slice);
+    if(_biases != nullptr)
+    {
+        // The biases are passed through a 1D window spanning their whole X dimension
+        Window bias_window;
+        bias_window.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+        add_1D_tensor_argument(arg_idx, _biases, bias_window);
+    }
+
+    enqueue(queue, *this, weights_slice);
+}
+
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(true) // is_shared = true: base configure() then allows up to 5D input / 3D output
+{
+}
+
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // Start from the first 3D slice of the execution window and the first 2D slice of the whole output
+    Window output_window;
+    output_window.use_tensor_dimensions(_output->info());
+    Window weights_slice  = window.first_slice_window_3D();
+    Window reshaped_slice = output_window.first_slice_window_2D();
+
+    // The bias windows stay default-constructed when no biases were configured
+    const bool has_biases = (_biases != nullptr);
+    Window     bias_window;
+    Window     bias_slice;
+    if(has_biases)
+    {
+        bias_window.use_tensor_dimensions(_biases->info());
+        bias_slice = bias_window.first_slice_window_1D();
+    }
+
+    // Enqueue once per slice; repeat only while both the input and the output slice can be slid further
+    bool has_next_slice = true;
+    while(has_next_slice)
+    {
+        unsigned arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, weights_slice);
+        add_2D_tensor_argument(arg_idx, _output, reshaped_slice);
+        if(has_biases)
+        {
+            add_1D_tensor_argument(arg_idx, _biases, bias_slice);
+            bias_window.slide_window_slice_1D(bias_slice); // move on to the next 1D bias slice
+        }
+        enqueue(queue, *this, weights_slice);
+        has_next_slice = window.slide_window_slice_4D(weights_slice) && output_window.slide_window_slice_2D(reshaped_slice);
+    }
+}
author	Anthony Barbier <anthony.barbier@arm.com>	2017-09-04 18:44:23 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-09-17 13:03:09 +0100
commit	6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
tree	a7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /src/core/CL/kernels
download	ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz