From 04a8f8c4994f1c32b3f16a832c0e6f2599364c02 Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Thu, 23 Nov 2017 11:45:24 +0000
Subject: COMPMID-692 Consistent names for the interfaces

Change-Id: I4b1f3f0da9ff5342c7de7083736fe91871d14e5b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110351
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
---
 src/core/CL/kernels/CLDepthConcatenateKernel.cpp   | 139 ------
 .../CL/kernels/CLDepthConcatenateLayerKernel.cpp   | 139 ++++++
 src/core/CL/kernels/CLDepthConvertKernel.cpp       | 117 -----
 src/core/CL/kernels/CLDepthConvertLayerKernel.cpp  | 117 +++++
 .../CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp | 219 ---------
 .../CLDepthwiseConvolutionLayer3x3Kernel.cpp       | 219 +++++++++
 src/core/CL/kernels/CLL2NormalizeKernel.cpp        | 110 -----
 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp   | 110 +++++
 .../kernels/GCDepthConcatenateKernel.cpp           | 143 ------
 .../kernels/GCDepthConcatenateLayerKernel.cpp      | 143 ++++++
 src/core/NEON/kernels/NEDepthConcatenateKernel.cpp | 170 -------
 .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 170 +++++++
 src/core/NEON/kernels/NEDepthConvertKernel.cpp     | 524 ---------------------
 .../NEON/kernels/NEDepthConvertLayerKernel.cpp     | 524 +++++++++++++++++++++
 .../kernels/NEDepthwiseConvolution3x3Kernel.cpp    | 186 --------
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp       | 186 ++++++++
 src/core/NEON/kernels/NEL2NormalizeKernel.cpp      | 126 -----
 src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 126 +++++
 src/graph/operations/CLSimpleOperations.cpp        |  10 +-
 src/graph/operations/NESimpleOperations.cpp        |  10 +-
 src/runtime/CL/functions/CLDepthConcatenate.cpp    |  78 ---
 .../CL/functions/CLDepthConcatenateLayer.cpp       |  78 +++
 src/runtime/CL/functions/CLDepthConvert.cpp        |  38 --
 src/runtime/CL/functions/CLDepthConvertLayer.cpp   |  38 ++
 .../CL/functions/CLDepthwiseConvolution.cpp        | 138 ------
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   | 138 ++++++
 src/runtime/CL/functions/CLL2Normalize.cpp         |  63 ---
 src/runtime/CL/functions/CLL2NormalizeLayer.cpp    |  63 +++
 src/runtime/CL/functions/CLLaplacianPyramid.cpp    |   2 +-
 .../GLES_COMPUTE/functions/GCDepthConcatenate.cpp  |  69 ---
 .../functions/GCDepthConcatenateLayer.cpp          |  69 +++
 src/runtime/NEON/functions/NEDepthConcatenate.cpp  |  74 ---
 .../NEON/functions/NEDepthConcatenateLayer.cpp     |  74 +++
 src/runtime/NEON/functions/NEDepthConvert.cpp      |  38 --
 src/runtime/NEON/functions/NEDepthConvertLayer.cpp |  38 ++
 .../NEON/functions/NEDepthwiseConvolution.cpp      | 126 -----
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 126 +++++
 src/runtime/NEON/functions/NEL2Normalize.cpp       |  57 ---
 src/runtime/NEON/functions/NEL2NormalizeLayer.cpp  |  57 +++
 src/runtime/NEON/functions/NELaplacianPyramid.cpp  |   2 +-
 40 files changed, 2427 insertions(+), 2427 deletions(-)
 delete mode 100644 src/core/CL/kernels/CLDepthConcatenateKernel.cpp
 create mode 100644 src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLDepthConvertKernel.cpp
 create mode 100644 src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
 delete mode 100644 src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
 create mode 100644 src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
 delete mode 100644 src/core/CL/kernels/CLL2NormalizeKernel.cpp
 create mode 100644 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
 delete mode 100644 src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
 create mode 100644 src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
 create mode 100644 src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEDepthConvertKernel.cpp
 create mode 100644 src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp
 create mode 100644 src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEL2NormalizeKernel.cpp
 create mode 100644 src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
 delete mode 100644 src/runtime/CL/functions/CLDepthConcatenate.cpp
 create mode 100644 src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
 delete mode 100644 src/runtime/CL/functions/CLDepthConvert.cpp
 create mode 100644 src/runtime/CL/functions/CLDepthConvertLayer.cpp
 delete mode 100644 src/runtime/CL/functions/CLDepthwiseConvolution.cpp
 create mode 100644 src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
 delete mode 100644 src/runtime/CL/functions/CLL2Normalize.cpp
 create mode 100644 src/runtime/CL/functions/CLL2NormalizeLayer.cpp
 delete mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp
 create mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
 delete mode 100644 src/runtime/NEON/functions/NEDepthConcatenate.cpp
 create mode 100644 src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
 delete mode 100644 src/runtime/NEON/functions/NEDepthConvert.cpp
 create mode 100644 src/runtime/NEON/functions/NEDepthConvertLayer.cpp
 delete mode 100644 src/runtime/NEON/functions/NEDepthwiseConvolution.cpp
 create mode 100644 src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
 delete mode 100644 src/runtime/NEON/functions/NEL2Normalize.cpp
 create mode 100644 src/runtime/NEON/functions/NEL2NormalizeLayer.cpp

(limited to 'src')

diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
deleted file mode 100644
index edfbf829ed..0000000000
--- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-#include <map>
-
-using namespace arm_compute;
-
-CLDepthConcatenateKernel::CLDepthConcatenateKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
-{
-}
-
-BorderSize CLDepthConcatenateKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
-void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
-{
-    static std::map<int, std::pair<std::string, int>> configs_map =
-    {
-        { 1, { "uchar", 16 } },
-        { 2, { "ushort", 8 } },
-        { 4, { "uint", 4 } },
-        { 8, { "ulong", 2 } },
-    };
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-    ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end());
-
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
-
-    _input        = input;
-    _output       = output;
-    _depth_offset = depth_offset;
-
-    // Add build options
-    auto                  config = configs_map.find(static_cast<int>(input->info()->element_size()));
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + config->second.first));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second)));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
-
-    // Configure kernel window
-    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const unsigned int num_elems_read_per_iteration      = 16 / input->info()->element_size();
-    const unsigned int num_rows_read_per_iteration       = 1;
-
-    // The window needs to be based on input as we copy all the depths of input
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
-
-    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
-}
-
-void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_3D();
-
-    const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
-
-    unsigned int  idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
-    const cl_int3 offsets =
-    {
-        {
-            static_cast<cl_int>(_left_right),
-            static_cast<cl_int>(_top_bottom),
-            static_cast<cl_int>(offset_to_first_elements_in_bytes),
-        }
-    };
-    _kernel.setArg<cl_int3>(idx, offsets);
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000000..0275d4fd83
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+
+CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize CLDepthConcatenateLayerKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+    static std::map<int, std::pair<std::string, int>> configs_map =
+    {
+        { 1, { "uchar", 16 } },
+        { 2, { "ushort", 8 } },
+        { 4, { "uint", 4 } },
+        { 8, { "ulong", 2 } },
+    };
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+    ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end());
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
+
+    // Add build options
+    auto                  config = configs_map.find(static_cast<int>(input->info()->element_size()));
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + config->second.first));
+    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
+
+    // Configure kernel window
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_read_per_iteration      = 16 / input->info()->element_size();
+    const unsigned int num_rows_read_per_iteration       = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
+
+    unsigned int  idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    const cl_int3 offsets =
+    {
+        {
+            static_cast<cl_int>(_left_right),
+            static_cast<cl_int>(_top_bottom),
+            static_cast<cl_int>(offset_to_first_elements_in_bytes),
+        }
+    };
+    _kernel.setArg<cl_int3>(idx, offsets);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
deleted file mode 100644
index b2132073d5..0000000000
--- a/src/core/CL/kernels/CLDepthConvertKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
-                                                  DataType::U16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
-                                                  DataType::U16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
-    ARM_COMPUTE_ERROR_ON(shift >= 8);
-
-    // Check if convertion is supported
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
-                             "Only data types supported [in] QS8 -> [out] F32");
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32),
-                             "Only data types supported [in] QS16 -> [out] F32");
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16),
-                             "Only data types supported [in] F32 -> [out] QS8, QS16");
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
-                                                                            && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
-                             "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
-                                                                             && output->info()->data_type() != DataType::S32),
-                             "Only data types supported [in] U16 ->  [out] U8, U32, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
-                                                                             && output->info()->data_type() != DataType::S32),
-                             "Only data types supported [in] S16 ->  [out] U8, U32, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
-                                                                             && output->info()->data_type() != DataType::S16),
-                             "Only data types supported [in] U32 ->  [out] U8, U16, S16");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
-                                                                             && output->info()->data_type() != DataType::S16),
-                             "Only data types supported [in] S32 ->  [out] U8, U16, S16");
-
-    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    // Get data sizes
-    const size_t input_size  = data_size_from_type(input->info()->data_type());
-    const size_t output_size = data_size_from_type(output->info()->data_type());
-
-    // Construct kernel name and build options
-    std::string           kernel_name = "convert_depth";
-    std::set<std::string> build_opts;
-    if(input_size > output_size)
-    {
-        kernel_name += "_down";
-        // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
-        build_opts.insert(((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
-    }
-    else
-    {
-        kernel_name += "_up";
-    }
-    build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-
-    // Set shift arg
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, shift);
-
-    // Configure kernel
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
new file mode 100644
index 0000000000..83908a1469
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+                                                  DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+                                                  DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input == output);
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
+    ARM_COMPUTE_ERROR_ON(shift >= 8);
+
+    // Check if convertion is supported
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+                             "Only data types supported [in] QS8 -> [out] F32");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32),
+                             "Only data types supported [in] QS16 -> [out] F32");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16),
+                             "Only data types supported [in] F32 -> [out] QS8, QS16");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
+                                                                            && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U16 ->  [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] S16 ->  [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] U32 ->  [out] U8, U16, S16");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] S32 ->  [out] U8, U16, S16");
+
+    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+    // Get data sizes
+    const size_t input_size  = data_size_from_type(input->info()->data_type());
+    const size_t output_size = data_size_from_type(output->info()->data_type());
+
+    // Construct kernel name and build options
+    std::string           kernel_name = "convert_depth";
+    std::set<std::string> build_opts;
+    if(input_size > output_size)
+    {
+        kernel_name += "_down";
+        // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+        build_opts.insert(((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
+    }
+    else
+    {
+        kernel_name += "_up";
+    }
+    build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+    {
+        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Set shift arg
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg(idx++, shift);
+
+    // Configure kernel
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
deleted file mode 100644
index e86c55fbc0..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
-using namespace arm_compute;
-
-namespace
-{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
-    TensorShape output_shape = input_shape;
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-
-    return output_shape;
-}
-} // namespace
-
-CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
-{
-}
-
-BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
-    if(biases != nullptr)
-    {
-        if(is_data_type_quantized_asymmetric(weights->info()->data_type()))
-        {
-            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    // Get convolved dimensions
-    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       output_shape,
-                       1,
-                       input->info()->data_type(),
-                       input->info()->fixed_point_position(),
-                       input->info()->quantization_info());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    _input         = input;
-    _output        = output;
-    _weights       = weights;
-    _biases        = biases;
-    _conv_stride_x = conv_info.stride().first;
-    _conv_stride_y = conv_info.stride().second;
-    _conv_pad_left = conv_info.pad_left();
-    _conv_pad_top  = conv_info.pad_top();
-    _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
-
-    // Set build options
-    ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
-    // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set static arguments
-    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
-    {
-        float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0);
-
-        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
-        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
-        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
-        _kernel.setArg(idx++, output_multiplier);
-        _kernel.setArg(idx++, output_shift);
-    }
-
-    // Configure the local work size for Bifrost with a value obtained
-    // via exhaustive autotuning for the MobileNets tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
-    {
-        const size_t width = input->info()->dimension(0);
-        if(width >= 56) // 56 or 112
-        {
-            _lws_hint = cl::NDRange(8, 5, 2);
-        }
-        else if(width >= 14) // 14 or 28
-        {
-            _lws_hint = cl::NDRange(1, 5, 2);
-        }
-        else // 7
-        {
-            _lws_hint = cl::NDRange(1, 1, 2);
-        }
-    }
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 2;
-    const unsigned int num_elems_written_per_iteration   = 2;
-    const unsigned int num_elems_read_per_iteration      = 3 + _conv_stride_x;
-    const unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
-
-    update_window_and_padding(win, input_access, weights_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
-}
-
-void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Create input window and adjust
-    Window win_in = window;
-    win_in.adjust(Window::DimX, -_conv_pad_left, true);
-    win_in.adjust(Window::DimY, -_conv_pad_top, true);
-    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
-    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-
-    Window slice_in      = win_in.first_slice_window_3D();
-    Window slice_out     = window.first_slice_window_3D();
-    Window slice_weights = window.first_slice_window_3D();
-    slice_weights.set_dimension_step(Window::DimX, 0);
-    slice_weights.set_dimension_step(Window::DimY, 0);
-
-    // Set biases
-    if(_biases != nullptr)
-    {
-        unsigned int idx = 3 * num_arguments_per_3D_tensor();
-        Window       slice_biases;
-        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-        add_1D_tensor_argument(idx, _biases, slice_biases);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_weights);
-
-        enqueue(queue, *this, slice_out, _lws_hint);
-    }
-    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
new file mode 100644
index 0000000000..003f1f8330
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Calculates expected output shape dimension
+ *
+ * @param[in] Input shape
+ *
+ * @return Expected output shape
+ */
+TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+
+    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+    TensorShape output_shape = input_shape;
+    output_shape.set(0, output_width);
+    output_shape.set(1, output_height);
+
+    return output_shape;
+}
+} // namespace
+
+CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel()
+    : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
+{
+}
+
+BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+    if(biases != nullptr)
+    {
+        if(is_data_type_quantized_asymmetric(weights->info()->data_type()))
+        {
+            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    // Get convolved dimensions
+    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _input         = input;
+    _output        = output;
+    _weights       = weights;
+    _biases        = biases;
+    _conv_stride_x = conv_info.stride().first;
+    _conv_stride_y = conv_info.stride().second;
+    _conv_pad_left = conv_info.pad_left();
+    _conv_pad_top  = conv_info.pad_top();
+    _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+
+    // Set build options
+    ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+
+    // Create kernel
+    std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments
+    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
+    {
+        float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0);
+
+        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
+        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
+        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
+        _kernel.setArg(idx++, output_multiplier);
+        _kernel.setArg(idx++, output_shift);
+    }
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        const size_t width = input->info()->dimension(0);
+        if(width >= 56) // 56 or 112
+        {
+            _lws_hint = cl::NDRange(8, 5, 2);
+        }
+        else if(width >= 14) // 14 or 28
+        {
+            _lws_hint = cl::NDRange(1, 5, 2);
+        }
+        else // 7
+        {
+            _lws_hint = cl::NDRange(1, 1, 2);
+        }
+    }
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 2;
+    const unsigned int num_elems_written_per_iteration   = 2;
+    const unsigned int num_elems_read_per_iteration      = 3 + _conv_stride_x;
+    const unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, weights_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Create input window and adjust
+    Window win_in = window;
+    win_in.adjust(Window::DimX, -_conv_pad_left, true);
+    win_in.adjust(Window::DimY, -_conv_pad_top, true);
+    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+    Window slice_in      = win_in.first_slice_window_3D();
+    Window slice_out     = window.first_slice_window_3D();
+    Window slice_weights = window.first_slice_window_3D();
+    slice_weights.set_dimension_step(Window::DimX, 0);
+    slice_weights.set_dimension_step(Window::DimY, 0);
+
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = 3 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        add_3D_tensor_argument(idx, _weights, slice_weights);
+
+        enqueue(queue, *this, slice_out, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/CL/kernels/CLL2NormalizeKernel.cpp b/src/core/CL/kernels/CLL2NormalizeKernel.cpp
deleted file mode 100644
index 3e0758c980..0000000000
--- a/src/core/CL/kernels/CLL2NormalizeKernel.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLL2NormalizeKernel::CLL2NormalizeKernel()
-    : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
-{
-}
-
-void CLL2NormalizeKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    // Sum and output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    _input   = input;
-    _sum     = sum;
-    _output  = output;
-    _axis    = axis;
-    _epsilon = epsilon;
-
-    const unsigned int num_elems_processed_per_iteration = 16;
-
-    // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
-
-    // Set epsilon argument
-    unsigned int idx = num_arguments_per_1D_tensor() * 3;
-    _kernel.setArg<cl_uint>(idx, _epsilon);
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
-}
-
-void CLL2NormalizeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window in_slice  = window.first_slice_window_1D();
-    Window sum_slice = window_sum.first_slice_window_1D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_1D_tensor_argument(idx, _input, in_slice);
-        add_1D_tensor_argument(idx, _sum, sum_slice);
-        add_1D_tensor_argument(idx, _output, in_slice);
-        enqueue(queue, *this, in_slice);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
new file mode 100644
index 0000000000..36e351e048
--- /dev/null
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
+    : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Sum and output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+    _input   = input;
+    _sum     = sum;
+    _output  = output;
+    _axis    = axis;
+    _epsilon = epsilon;
+
+    const unsigned int num_elems_processed_per_iteration = 16;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+
+    // Set epsilon argument
+    unsigned int idx = num_arguments_per_1D_tensor() * 3;
+    _kernel.setArg<cl_uint>(idx, _epsilon);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_sum(window);
+    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_1D();
+    Window sum_slice = window_sum.first_slice_window_1D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_1D_tensor_argument(idx, _input, in_slice);
+        add_1D_tensor_argument(idx, _sum, sum_slice);
+        add_1D_tensor_argument(idx, _output, in_slice);
+        enqueue(queue, *this, in_slice);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
deleted file mode 100644
index b90a8e7b89..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-GCDepthConcatenateKernel::GCDepthConcatenateKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
-{
-}
-
-BorderSize GCDepthConcatenateKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
-void GCDepthConcatenateKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
-
-    _input  = input;
-    _output = output;
-
-    // Add build options
-    std::set<std::string> build_opts;
-    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
-    build_opts.emplace(("#define " + dt_name));
-    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
-    // Configure kernel window
-    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
-    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
-
-    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
-    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
-    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
-
-    // Create kernel
-    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
-
-    unsigned int num_elems_processed_per_iteration = 1;
-    unsigned int num_elems_read_per_iteration      = 1;
-    if(input->info()->data_type() == DataType::F32)
-    {
-        num_elems_processed_per_iteration = 1;
-        num_elems_read_per_iteration      = 1;
-    }
-    else if(input->info()->data_type() == DataType::F16)
-    {
-        num_elems_processed_per_iteration = 4;
-        num_elems_read_per_iteration      = 4;
-    }
-    const unsigned int num_rows_read_per_iteration = 1;
-
-    // The window needs to be based on input as we copy all the depths of input
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
-
-    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-    IGCKernel::configure(win);
-}
-
-void GCDepthConcatenateKernel::run(const Window &window)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
-    _kernel.use();
-
-    Window slice = window.first_slice_window_3D();
-
-    do
-    {
-        if(_input->info()->data_type() == DataType::F32)
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, 1, slice);
-            add_3D_tensor_argument(idx, _output, 2, slice);
-        }
-        else if(_input->info()->data_type() == DataType::F16)
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
-            add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
-        }
-
-        _kernel.update_shader_params();
-
-        enqueue(*this, slice);
-    }
-    while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000000..a6111782fd
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize GCDepthConcatenateLayerKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Add build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Configure kernel window
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
+
+    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
+    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
+
+    unsigned int num_elems_processed_per_iteration = 1;
+    unsigned int num_elems_read_per_iteration      = 1;
+    if(input->info()->data_type() == DataType::F32)
+    {
+        num_elems_processed_per_iteration = 1;
+        num_elems_read_per_iteration      = 1;
+    }
+    else if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+        num_elems_read_per_iteration      = 4;
+    }
+    const unsigned int num_rows_read_per_iteration = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCDepthConcatenateLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        if(_input->info()->data_type() == DataType::F32)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, 1, slice);
+            add_3D_tensor_argument(idx, _output, 2, slice);
+        }
+        else if(_input->info()->data_type() == DataType::F16)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
+            add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+        }
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
deleted file mode 100644
index 7a62b0cb03..0000000000
--- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace
-{
-// Overloads of 128-bit vector loads
-uint8x16_t loadq(const uint8_t *ptr)
-{
-    return vld1q_u8(ptr);
-}
-uint16x8_t loadq(const uint16_t *ptr)
-{
-    return vld1q_u16(ptr);
-}
-uint32x4_t loadq(const uint32_t *ptr)
-{
-    return vld1q_u32(ptr);
-}
-// Overloads of 128-bit vector stores
-void storeq(uint8_t *ptr, uint8x16_t val)
-{
-    return vst1q_u8(ptr, val);
-}
-void storeq(uint16_t *ptr, uint16x8_t val)
-{
-    return vst1q_u16(ptr, val);
-}
-void storeq(uint32_t *ptr, uint32x4_t val)
-{
-    return vst1q_u32(ptr, val);
-}
-
-template <typename T>
-void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
-{
-    const int start_x = start_xy.first;
-    const int start_y = start_xy.second;
-
-    // Offset input
-    const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
-    uint8_t *input_ptr                               = in->buffer() + input_offset_to_first_elements_in_bytes;
-
-    // Offset output
-    const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
-    uint8_t           *output_ptr                               = out->buffer() + output_offset_to_first_elements_in_bytes;
-
-    Iterator input(in, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_ptr + input.offset());
-        const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
-
-        storeq(out_ptr, loadq(in_ptr));
-    },
-    input, output);
-}
-} // namespace
-
-NEDepthConcatenateKernel::NEDepthConcatenateKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
-{
-}
-
-BorderSize NEDepthConcatenateKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
-void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
-
-    _func         = nullptr;
-    _input        = input;
-    _output       = output;
-    _depth_offset = depth_offset;
-    _left_right   = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom   = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::QS8:
-            _func = &depth_concat<uint8_t>;
-            break;
-        case DataType::QS16:
-        case DataType::F16:
-            _func = &depth_concat<uint16_t>;
-            break;
-        case DataType::F32:
-            _func = &depth_concat<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-    }
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const unsigned int num_elems_read_per_iteration      = 16 / input->info()->element_size();
-    const unsigned int num_rows_read_per_iteration       = 1;
-
-    // The window needs to be based on input as we copy all the depths of input
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
-
-    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEDepthConcatenateKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
-}
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000000..01b0f10f70
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+// Overloads of 128-bit vector loads
+uint8x16_t loadq(const uint8_t *ptr)
+{
+    return vld1q_u8(ptr);
+}
+uint16x8_t loadq(const uint16_t *ptr)
+{
+    return vld1q_u16(ptr);
+}
+uint32x4_t loadq(const uint32_t *ptr)
+{
+    return vld1q_u32(ptr);
+}
+// Overloads of 128-bit vector stores
+void storeq(uint8_t *ptr, uint8x16_t val)
+{
+    return vst1q_u8(ptr, val);
+}
+void storeq(uint16_t *ptr, uint16x8_t val)
+{
+    return vst1q_u16(ptr, val);
+}
+void storeq(uint32_t *ptr, uint32x4_t val)
+{
+    return vst1q_u32(ptr, val);
+}
+
+template <typename T>
+void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
+{
+    const int start_x = start_xy.first;
+    const int start_y = start_xy.second;
+
+    // Offset input
+    const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
+    uint8_t *input_ptr                               = in->buffer() + input_offset_to_first_elements_in_bytes;
+
+    // Offset output
+    const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
+    uint8_t           *output_ptr                               = out->buffer() + output_offset_to_first_elements_in_bytes;
+
+    Iterator input(in, window);
+    Iterator output(out, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const T *>(input_ptr + input.offset());
+        const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+
+        storeq(out_ptr, loadq(in_ptr));
+    },
+    input, output);
+}
+} // namespace
+
+NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize NEDepthConcatenateLayerKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _func         = nullptr;
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
+    _left_right   = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom   = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func = &depth_concat<uint8_t>;
+            break;
+        case DataType::QS16:
+        case DataType::F16:
+            _func = &depth_concat<uint16_t>;
+            break;
+        case DataType::F32:
+            _func = &depth_concat<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_read_per_iteration      = 16 / input->info()->element_size();
+    const unsigned int num_rows_read_per_iteration       = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
+}
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
deleted file mode 100644
index d97a20be65..0000000000
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ /dev/null
@@ -1,524 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-NEDepthConvertKernel::NEDepthConvertKernel()
-    : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
-{
-}
-
-void NEDepthConvertKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
-
-    _input  = input;
-    _output = input;
-    _policy = policy;
-    _shift  = shift;
-
-    if(output != nullptr)
-    {
-        // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
-        set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-        // Set output
-        _output = output;
-    }
-
-    // Set initial fixed point position of input and output
-    _fixed_point_position_input  = input->info()->fixed_point_position();
-    _fixed_point_position_output = _output->info()->fixed_point_position();
-
-    // Set the fixed point position to the output tensor if needed
-    if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
-    {
-        // If in-place set the fixed point position of the output tensor to be equal to shift
-        _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
-        // Set fixed point position to output tensor
-        _output->info()->set_fixed_point_position(_fixed_point_position_output);
-    }
-
-    ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
-    ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
-                                                                            && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] U8 -> [out] U16, S16, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
-                             "Only data_types supported [in] QS8 ->  [out] QS8, F32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
-                             "Only data_types supported [in] U16 ->  [out] U8, U32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] S16 ->  [out] U8, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
-                             "Only data_types supported [in] QS16 ->  [out] QS16, F32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
-                             "Only data_types supported [in] F32 ->  [out] QS8, QS16");
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        update_window_and_padding(win, input_access);
-    }
-    ICPPKernel::configure(win);
-}
-
-void NEDepthConvertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(nullptr == _input);
-    ARM_COMPUTE_ERROR_ON(nullptr == _output);
-    ARM_COMPUTE_ERROR_ON(_input == _output);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    bool in_place = (_input == _output);
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::U8:
-        {
-            const int16x8_t b = vdupq_n_s16(_shift);
-
-            switch(_output->info()->data_type())
-            {
-                case DataType::S16:
-                {
-                    /* Up-conversion U8 -> S16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
-
-                        const int16x8x2_t texels =
-                        {
-                            {
-                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
-                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
-                            }
-                        };
-
-                        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
-                        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
-                    },
-                    input, output);
-                    break;
-                }
-                case DataType::S32:
-                {
-                    /* Up-conversion U8 -> S32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
-
-                        const int16x8x2_t texels =
-                        {
-                            {
-                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
-                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
-                            }
-                        };
-
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
-                    },
-                    input, output);
-                    break;
-                }
-                case DataType::U16:
-                {
-                    /* Up-conversion U8 -> U16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
-
-                        const uint16x8x2_t texels =
-                        {
-                            {
-                                vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
-                                vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
-                            }
-                        };
-
-                        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
-                        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::QS8:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS8:
-                {
-                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
-                    /* Fixed point position conversion QS8 -> QS8 */
-                    if(relative_shift != 0 || !in_place)
-                    {
-                        const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
-                            vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::F32:
-                {
-                    /* Up-conversion QS8 -> F32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
-
-                        float32x4x2_t texels_low  = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
-                        float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
-
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::S16:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                {
-                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
-
-                    /* Down-conversion S16 -> U8 */
-                    if(ConvertPolicy::SATURATE == _policy)
-                    {
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const int16x8x2_t texels =
-                            {
-                                {
-                                    vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
-                                    vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
-                                }
-                            };
-
-                            vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
-                        },
-                        input, output);
-                    }
-                    else
-                    {
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const int16x8x2_t texels =
-                            {
-                                {
-                                    vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
-                                    vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
-                                }
-                            };
-
-                            vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
-                                                               vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::S32:
-                {
-                    const int32x4_t b = vdupq_n_s32(_shift);
-
-                    /* Up-conversion S16 -> S32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const int16x8x2_t texels =
-                        {
-                            {
-                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
-                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
-                            }
-                        };
-
-                        const int32x4x4_t texels_s32 =
-                        {
-                            {
-                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
-                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
-                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
-                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
-                            }
-                        };
-
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::U16:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                {
-                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
-
-                    /* Down-conversion U16 -> U8 */
-                    if(ConvertPolicy::SATURATE == _policy)
-                    {
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const uint16x8x2_t texels =
-                            {
-                                {
-                                    vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
-                                    vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
-                                }
-                            };
-
-                            vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
-                        },
-                        input, output);
-                    }
-                    else
-                    {
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const uint16x8x2_t texels =
-                            {
-                                {
-                                    vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
-                                    vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
-                                }
-                            };
-
-                            vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::U32:
-                {
-                    const int32x4_t b = vdupq_n_s32(_shift);
-
-                    /* Up-conversion U16 -> U32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const uint16x8x2_t texels =
-                        {
-                            {
-                                vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
-                                vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
-                            }
-                        };
-
-                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
-                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
-                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
-                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::QS16:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS16:
-                {
-                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
-                    /* Fixed point position conversion QS16 -> QS16 */
-                    if(relative_shift != 0 || !in_place)
-                    {
-                        const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
-                        execute_window_loop(window, [&](const Coordinates & id)
-                        {
-                            const qint16x8x2_t texels_qs16 =
-                            {
-                                {
-                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
-                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
-                                }
-                            };
-                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
-                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
-                        },
-                        input, output);
-                    }
-                    break;
-                }
-                case DataType::F32:
-                {
-                    /* Up-conversion QS16 -> F32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const int16x8x2_t texels_qs16 =
-                        {
-                            {
-                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
-                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
-                            }
-                        };
-
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        case DataType::F32:
-        {
-            switch(_output->info()->data_type())
-            {
-                case DataType::QS8:
-                {
-                    /* Down-conversion F32 -> QS8 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const float32x4x4_t texels_f32 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
-                            }
-                        };
-
-                        const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
-
-                        vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
-                    },
-                    input, output);
-                    break;
-                }
-                case DataType::QS16:
-                {
-                    /* Down-conversion F32 -> QS16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const float32x4x2_t texels_f32_1 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
-                            }
-                        };
-                        const float32x4x2_t texels_f32_2 =
-                        {
-                            {
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
-                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
-                            }
-                        };
-
-                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
-                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
-                    },
-                    input, output);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-}
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
new file mode 100644
index 0000000000..c29cb57513
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
+    : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
+{
+}
+
+void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+
+    _input  = input;
+    _output = input;
+    _policy = policy;
+    _shift  = shift;
+
+    if(output != nullptr)
+    {
+        // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+        set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+        // Set output
+        _output = output;
+    }
+
+    // Set initial fixed point position of input and output
+    _fixed_point_position_input  = input->info()->fixed_point_position();
+    _fixed_point_position_output = _output->info()->fixed_point_position();
+
+    // Set the fixed point position to the output tensor if needed
+    if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
+    {
+        // If in-place set the fixed point position of the output tensor to be equal to shift
+        _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
+        // Set fixed point position to output tensor
+        _output->info()->set_fixed_point_position(_fixed_point_position_output);
+    }
+
+    ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
+    ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
+                                                                            && output->info()->data_type() != DataType::S32),
+                             "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
+                             "Only data_types supported [in] QS8 ->  [out] QS8, F32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
+                             "Only data_types supported [in] U16 ->  [out] U8, U32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
+                             "Only data_types supported [in] S16 ->  [out] U8, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
+                             "Only data_types supported [in] QS16 ->  [out] QS16, F32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+                             "Only data_types supported [in] F32 ->  [out] QS8, QS16");
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+        update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, input->info()->valid_region());
+    }
+    else
+    {
+        // In-place computation
+        update_window_and_padding(win, input_access);
+    }
+    ICPPKernel::configure(win);
+}
+
+void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(nullptr == _input);
+    ARM_COMPUTE_ERROR_ON(nullptr == _output);
+    ARM_COMPUTE_ERROR_ON(_input == _output);
+
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    bool in_place = (_input == _output);
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::U8:
+        {
+            const int16x8_t b = vdupq_n_s16(_shift);
+
+            switch(_output->info()->data_type())
+            {
+                case DataType::S16:
+                {
+                    /* Up-conversion U8 -> S16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+                            }
+                        };
+
+                        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
+                        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
+                    },
+                    input, output);
+                    break;
+                }
+                case DataType::S32:
+                {
+                    /* Up-conversion U8 -> S32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+                            }
+                        };
+
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+                    },
+                    input, output);
+                    break;
+                }
+                case DataType::U16:
+                {
+                    /* Up-conversion U8 -> U16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+                        const uint16x8x2_t texels =
+                        {
+                            {
+                                vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
+                                vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
+                            }
+                        };
+
+                        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
+                        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        case DataType::QS8:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::QS8:
+                {
+                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+                    /* Fixed point position conversion QS8 -> QS8 */
+                    if(relative_shift != 0 || !in_place)
+                    {
+                        const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+                            vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
+                        },
+                        input, output);
+                    }
+                    break;
+                }
+                case DataType::F32:
+                {
+                    /* Up-conversion QS8 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+
+                        float32x4x2_t texels_low  = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
+                        float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        case DataType::S16:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::U8:
+                {
+                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+                    /* Down-conversion S16 -> U8 */
+                    if(ConvertPolicy::SATURATE == _policy)
+                    {
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const int16x8x2_t texels =
+                            {
+                                {
+                                    vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+                                    vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+                                }
+                            };
+
+                            vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+                        },
+                        input, output);
+                    }
+                    else
+                    {
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const int16x8x2_t texels =
+                            {
+                                {
+                                    vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+                                    vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+                                }
+                            };
+
+                            vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+                                                               vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+                        },
+                        input, output);
+                    }
+                    break;
+                }
+                case DataType::S32:
+                {
+                    const int32x4_t b = vdupq_n_s32(_shift);
+
+                    /* Up-conversion S16 -> S32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
+                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        const int32x4x4_t texels_s32 =
+                        {
+                            {
+                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
+                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
+                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
+                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
+                            }
+                        };
+
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        case DataType::U16:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::U8:
+                {
+                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+                    /* Down-conversion U16 -> U8 */
+                    if(ConvertPolicy::SATURATE == _policy)
+                    {
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const uint16x8x2_t texels =
+                            {
+                                {
+                                    vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+                                    vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+                                }
+                            };
+
+                            vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+                        },
+                        input, output);
+                    }
+                    else
+                    {
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const uint16x8x2_t texels =
+                            {
+                                {
+                                    vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+                                    vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+                                }
+                            };
+
+                            vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+                        },
+                        input, output);
+                    }
+                    break;
+                }
+                case DataType::U32:
+                {
+                    const int32x4_t b = vdupq_n_s32(_shift);
+
+                    /* Up-conversion U16 -> U32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint16x8x2_t texels =
+                        {
+                            {
+                                vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
+                                vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
+                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
+                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
+                        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        case DataType::QS16:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::QS16:
+                {
+                    const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+                    /* Fixed point position conversion QS16 -> QS16 */
+                    if(relative_shift != 0 || !in_place)
+                    {
+                        const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
+                        execute_window_loop(window, [&](const Coordinates & id)
+                        {
+                            const qint16x8x2_t texels_qs16 =
+                            {
+                                {
+                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
+                                    vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+                                }
+                            };
+                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
+                            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
+                        },
+                        input, output);
+                    }
+                    break;
+                }
+                case DataType::F32:
+                {
+                    /* Up-conversion QS16 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const int16x8x2_t texels_qs16 =
+                        {
+                            {
+                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        case DataType::F32:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::QS8:
+                {
+                    /* Down-conversion F32 -> QS8 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x4_t texels_f32 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+                            }
+                        };
+
+                        const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
+
+                        vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
+                    },
+                    input, output);
+                    break;
+                }
+                case DataType::QS16:
+                {
+                    /* Down-conversion F32 -> QS16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x2_t texels_f32_1 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+                            }
+                        };
+                        const float32x4x2_t texels_f32_2 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+                            }
+                        };
+
+                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
+                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp
deleted file mode 100644
index 5c4bd34e05..0000000000
--- a/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-using namespace arm_compute::detail;
-
-NEDepthwiseConvolution3x3Kernel::NEDepthwiseConvolution3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_info()
-{
-}
-
-BorderSize NEDepthwiseConvolution3x3Kernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEDepthwiseConvolution3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
-    std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
-                                                                              weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
-                                                                              conv_info);
-
-    ARM_COMPUTE_UNUSED(expected_output);
-    ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
-    ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
-
-    _input                           = input;
-    _output                          = output;
-    _weights                         = weights;
-    _conv_info                       = conv_info;
-    const unsigned int conv_stride_x = conv_info.stride().first;
-    const unsigned int conv_pad_x    = conv_info.pad().first;
-    const unsigned int conv_pad_y    = conv_info.pad().second;
-
-    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
-
-    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
-    _border_size                                       = BorderSize(conv_pad_y, conv_pad_x);
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration));
-
-    AccessWindowStatic     input_access(input->info(), -conv_pad_x, -conv_pad_y, input->info()->dimension(0) + _border_size.right, input->info()->dimension(1) + _border_size.bottom);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-template <unsigned int stridex>
-class convolver_3x3
-{
-public:
-    static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
-    {
-        const int          input_stride_x  = input->info()->strides_in_bytes().x();
-        const int          input_stride_y  = input->info()->strides_in_bytes().y();
-        const int          output_stride_y = output->info()->strides_in_bytes().y();
-        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
-        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
-        const int          output_w        = output->info()->dimension(0);
-        const int          output_h        = output->info()->dimension(1);
-        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
-        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_x      = std::get<0>(conv_info.pad());
-        const unsigned int conv_pad_y      = std::get<1>(conv_info.pad());
-
-        // setup output window for the iterator
-        Window window_out = window;
-        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
-        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
-
-        // setup input window for the iterator
-        Window window_in = window;
-        // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0
-        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
-        Iterator in(input, window_in);
-        Iterator out(output, window_out);
-        Iterator w(weights, window_k);
-
-        const uint8_t *weights_ptr = w.ptr();
-
-        execute_window_loop(window_out, [&](const Coordinates & id)
-        {
-            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
-            int            ih        = 0;
-            int            oh        = 0;
-
-            const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
-            const auto     ptr_weights_r0   = reinterpret_cast<const float *>(ptr_weights_base);
-            const auto     ptr_weights_r1   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
-            const auto     ptr_weights_r2   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
-            const auto     vw_r0            = load_matrix_row(ptr_weights_r0);
-            const auto     vw_r1            = load_matrix_row(ptr_weights_r1);
-            const auto     vw_r2            = load_matrix_row(ptr_weights_r2);
-
-            for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
-            {
-                auto in_top = reinterpret_cast<const float *>(input_ptr + (ih + 0) * input_stride_y);
-                auto in_mid = reinterpret_cast<const float *>(input_ptr + (ih + 1) * input_stride_y);
-                auto in_low = reinterpret_cast<const float *>(input_ptr + (ih + 2) * input_stride_y);
-                auto p_out  = reinterpret_cast<float *>(out.ptr() + oh * output_stride_y);
-
-                for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
-                    in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
-                {
-                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0);
-                    store_results<stridex>(p_out, vres);
-                }
-            }
-        },
-        in, out);
-    }
-};
-
-void NEDepthwiseConvolution3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_UNUSED(info);
-
-    const unsigned int conv_stride_x                   = _conv_info.stride().first;
-    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
-
-    switch(conv_stride_x)
-    {
-        case 1:
-            convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-            break;
-        case 2:
-            convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-            break;
-        case 3:
-            convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
-}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
new file mode 100644
index 0000000000..02962e0492
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+using namespace arm_compute::detail;
+
+NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
+    : _border_size(0), _input(), _output(), _weights(), _conv_info()
+{
+}
+
+BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+    return _border_size;
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+    std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
+                                                                              weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
+                                                                              conv_info);
+
+    ARM_COMPUTE_UNUSED(expected_output);
+    ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
+    ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
+
+    _input                           = input;
+    _output                          = output;
+    _weights                         = weights;
+    _conv_info                       = conv_info;
+    const unsigned int conv_stride_x = conv_info.stride().first;
+    const unsigned int conv_pad_x    = conv_info.pad().first;
+    const unsigned int conv_pad_y    = conv_info.pad().second;
+
+    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
+
+    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+    _border_size                                       = BorderSize(conv_pad_y, conv_pad_x);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration));
+
+    AccessWindowStatic     input_access(input->info(), -conv_pad_x, -conv_pad_y, input->info()->dimension(0) + _border_size.right, input->info()->dimension(1) + _border_size.bottom);
+    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, weights_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+template <unsigned int stridex>
+class convolver_3x3
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          output_stride_y = output->info()->strides_in_bytes().y();
+        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          output_w        = output->info()->dimension(0);
+        const int          output_h        = output->info()->dimension(1);
+        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_x      = std::get<0>(conv_info.pad());
+        const unsigned int conv_pad_y      = std::get<1>(conv_info.pad());
+
+        // setup output window for the iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+
+        // setup input window for the iterator
+        Window window_in = window;
+        // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+        Iterator in(input, window_in);
+        Iterator out(output, window_out);
+        Iterator w(weights, window_k);
+
+        const uint8_t *weights_ptr = w.ptr();
+
+        execute_window_loop(window_out, [&](const Coordinates & id)
+        {
+            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            int            ih        = 0;
+            int            oh        = 0;
+
+            const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
+            const auto     ptr_weights_r0   = reinterpret_cast<const float *>(ptr_weights_base);
+            const auto     ptr_weights_r1   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
+            const auto     ptr_weights_r2   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
+            const auto     vw_r0            = load_matrix_row(ptr_weights_r0);
+            const auto     vw_r1            = load_matrix_row(ptr_weights_r1);
+            const auto     vw_r2            = load_matrix_row(ptr_weights_r2);
+
+            for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+            {
+                auto in_top = reinterpret_cast<const float *>(input_ptr + (ih + 0) * input_stride_y);
+                auto in_mid = reinterpret_cast<const float *>(input_ptr + (ih + 1) * input_stride_y);
+                auto in_low = reinterpret_cast<const float *>(input_ptr + (ih + 2) * input_stride_y);
+                auto p_out  = reinterpret_cast<float *>(out.ptr() + oh * output_stride_y);
+
+                for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+                    in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+                {
+                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0);
+                    store_results<stridex>(p_out, vres);
+                }
+            }
+        },
+        in, out);
+    }
+};
+
+void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_UNUSED(info);
+
+    const unsigned int conv_stride_x                   = _conv_info.stride().first;
+    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        case 2:
+            convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        case 3:
+            convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
diff --git a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
deleted file mode 100644
index 12c532afd5..0000000000
--- a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cmath>
-
-using namespace arm_compute;
-
-namespace
-{
-void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
-{
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window in_slice  = window.first_slice_window_1D();
-    Window sum_slice = window_sum.first_slice_window_1D();
-
-    do
-    {
-        Iterator input_it(in, in_slice);
-        Iterator sum_it(sum, sum_slice);
-        Iterator output_it(out, in_slice);
-
-        const float       sum_value           = *reinterpret_cast<const float *>(sum_it.ptr());
-        const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const float *>(input_it.ptr());
-            const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
-
-            vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
-        },
-        input_it, output_it);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-} // namespace
-
-NEL2NormalizeKernel::NEL2NormalizeKernel()
-    : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
-{
-}
-
-void NEL2NormalizeKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    unsigned int num_elems_processed_per_iteration     = 16 / data_size_from_type(input->info()->data_type());
-    unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
-
-    _input   = input;
-    _sum     = sum;
-    _output  = output;
-    _axis    = axis;
-    _epsilon = epsilon;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, sum_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-void NEL2NormalizeKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    switch(_axis)
-    {
-        case 0:
-            l2_normalize_X(_input, _sum, _output, _epsilon, window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported normalization axis");
-    }
-}
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
new file mode 100644
index 0000000000..3bf1d9400e
--- /dev/null
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    Window window_sum(window);
+    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_1D();
+    Window sum_slice = window_sum.first_slice_window_1D();
+
+    do
+    {
+        Iterator input_it(in, in_slice);
+        Iterator sum_it(sum, sum_slice);
+        Iterator output_it(out, in_slice);
+
+        const float       sum_value           = *reinterpret_cast<const float *>(sum_it.ptr());
+        const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const float *>(input_it.ptr());
+            const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+
+            vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+        },
+        input_it, output_it);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
+
+NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
+    : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
+    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+    unsigned int num_elems_processed_per_iteration     = 16 / data_size_from_type(input->info()->data_type());
+    unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+
+    _input   = input;
+    _sum     = sum;
+    _output  = output;
+    _axis    = axis;
+    _epsilon = epsilon;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, sum_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    INEKernel::configure(win);
+}
+
+void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    switch(_axis)
+    {
+        case 0:
+            l2_normalize_X(_input, _sum, _output, _epsilon, window);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported normalization axis");
+    }
+}
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
index 647f88f0e2..8f2bf23ce3 100644
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ b/src/graph/operations/CLSimpleOperations.cpp
@@ -106,7 +106,7 @@ REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationT
     return std::move(batch_norm);
 }
 
-/* DepthConvert Layer */
+/* DepthConvertLayer Layer */
 REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::DepthConvertLayer)
 {
     ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
@@ -121,7 +121,7 @@ REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::D
     const auto shift       = ctx.parameter<uint32_t>("shift");
 
     // Create and configure function
-    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthConvert>();
+    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthConvertLayer>();
     depthconvert->configure(in, out, conv_policy, shift);
 
     // Log info
@@ -156,13 +156,13 @@ REGISTER_SIMPLE_OPERATION(CLDepthwiseConvolutionOperation, OPENCL, OperationType
     bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
     if(run_3x3_opt)
     {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolution>();
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
         depwthwise_conv->configure(in, weights, biases, out, conv_info);
         func = std::move(depwthwise_conv);
     }
     else
     {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolution3x3>();
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
         depwthwise_conv->configure(in, weights, biases, out, conv_info);
         func = std::move(depwthwise_conv);
     }
@@ -313,7 +313,7 @@ REGISTER_SIMPLE_OPERATION(CLL2NormalizeLayerOperation, OPENCL, OperationType::L2
     const auto epsilon = ctx.parameter<float>("epsilon");
 
     // Create and configure function
-    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLL2Normalize>();
+    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLL2NormalizeLayer>();
     l2_norm->configure(in, out, axis, epsilon);
 
     // Log info
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
index f234341cec..bb99e8da4b 100644
--- a/src/graph/operations/NESimpleOperations.cpp
+++ b/src/graph/operations/NESimpleOperations.cpp
@@ -106,7 +106,7 @@ REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationTyp
     return std::move(batch_norm);
 }
 
-/* DepthConvert Layer */
+/* DepthConvertLayer Layer */
 REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::DepthConvertLayer)
 {
     ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
@@ -121,7 +121,7 @@ REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::Dep
     const auto shift       = ctx.parameter<uint32_t>("shift");
 
     // Create and configure function
-    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthConvert>();
+    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthConvertLayer>();
     depthconvert->configure(in, out, conv_policy, shift);
 
     // Log info
@@ -156,13 +156,13 @@ REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType::
     bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
     if(run_3x3_opt)
     {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolution>();
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
         depwthwise_conv->configure(in, weights, biases, out, conv_info);
         func = std::move(depwthwise_conv);
     }
     else
     {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolution3x3>();
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
         depwthwise_conv->configure(in, weights, biases, out, conv_info);
         func = std::move(depwthwise_conv);
     }
@@ -313,7 +313,7 @@ REGISTER_SIMPLE_OPERATION(NEL2NormalizeLayerOperation, NEON, OperationType::L2No
     const auto epsilon = ctx.parameter<float>("epsilon");
 
     // Create and configure function
-    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEL2Normalize>();
+    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEL2NormalizeLayer>();
     l2_norm->configure(in, out, axis, epsilon);
 
     // Log info
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
deleted file mode 100644
index 89e44ca98e..0000000000
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthConcatenate::CLDepthConcatenate() // NOLINT
-    : _inputs_vector(),
-      _concat_kernels_vector(),
-      _border_handlers_vector(),
-      _num_inputs(0)
-{
-}
-
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
-{
-    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
-    _num_inputs = inputs_vector.size();
-
-    unsigned int depth_offset = 0;
-
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
-
-    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
-
-    for(unsigned int i = 0; i < _num_inputs; i++)
-    {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-
-        depth_offset += inputs_vector.at(i)->info()->dimension(2);
-    }
-}
-
-void CLDepthConcatenate::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    for(unsigned i = 0; i < _num_inputs; i++)
-    {
-        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
-    }
-}
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
new file mode 100644
index 0000000000..05b5d54cf7
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
+    : _inputs_vector(),
+      _concat_kernels_vector(),
+      _border_handlers_vector(),
+      _num_inputs(0)
+{
+}
+
+void CLDepthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+
+    unsigned int depth_offset = 0;
+
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+void CLDepthConcatenateLayer::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
deleted file mode 100644
index b64d05b8b1..0000000000
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
-
-#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
new file mode 100644
index 0000000000..b448465909
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+    k->configure(input, output, policy, shift);
+    _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
deleted file mode 100644
index 81149508dd..0000000000
--- a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
-    : _kernel(), _border_handler()
-{
-}
-
-void CLDepthwiseConvolution3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-
-    _kernel.set_target(CLScheduler::get().target());
-    _kernel.configure(input, weights, biases, output, conv_info);
-
-    // Configure border handler
-    PixelValue &&zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
-    {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
-    }
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
-}
-
-void CLDepthwiseConvolution3x3::run()
-{
-    CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(_kernel);
-}
-
-CLDepthwiseConvolution::CLDepthwiseConvolution()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
-      _v2mm_output()
-{
-}
-
-void CLDepthwiseConvolution::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
-
-    const size_t weights_w = weights->info()->dimension(0);
-    const size_t weights_h = weights->info()->dimension(1);
-    const size_t weights_z = weights->info()->dimension(2);
-
-    const bool      has_bias   = (biases != nullptr);
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
-
-    // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
-    const size_t conv_size  = conv_w * conv_h;
-
-    // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _input_reshaped.allocator()->init(info_im2col);
-    _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
-
-    // Weights reshape configuration
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
-    _weights_reshaped.allocator()->init(info_weights_reshape);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
-
-    // GEMV configuration
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _v2mm_output.allocator()->init(info_v2mm_out);
-    _v2mm_kernel.set_target(gpu_target);
-    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
-
-    BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
-
-    border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
-
-    // Allocate intermediate tensors
-    _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
-    _v2mm_output.allocator()->allocate();
-}
-
-void CLDepthwiseConvolution::run()
-{
-    CLScheduler::get().enqueue(_im2col_kernel);
-
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-    CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-    CLScheduler::get().enqueue(_v2mm_kernel);
-
-    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000000..02273fe08b
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
+    : _kernel(), _border_handler()
+{
+}
+
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    _kernel.set_target(CLScheduler::get().target());
+    _kernel.configure(input, weights, biases, output, conv_info);
+
+    // Configure border handler
+    PixelValue &&zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+    }
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+void CLDepthwiseConvolutionLayer3x3::run()
+{
+    CLScheduler::get().enqueue(_border_handler);
+    CLScheduler::get().enqueue(_kernel);
+}
+
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
+      _v2mm_output()
+{
+}
+
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+    const size_t weights_w = weights->info()->dimension(0);
+    const size_t weights_h = weights->info()->dimension(1);
+    const size_t weights_z = weights->info()->dimension(2);
+
+    const bool      has_bias   = (biases != nullptr);
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+    // Set up intermediate tensors
+    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+    const size_t conv_size  = conv_w * conv_h;
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input_reshaped.allocator()->init(info_im2col);
+    _im2col_kernel.set_target(gpu_target);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+    // GEMV configuration
+    TensorShape shape_v2mm_out = input->info()->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _v2mm_output.allocator()->init(info_v2mm_out);
+    _v2mm_kernel.set_target(gpu_target);
+    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+    BorderSize border_size = _v2mm_kernel.border_size();
+    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+    border_size.bottom = 0;
+    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+    // Allocate intermediate tensors
+    _input_reshaped.allocator()->allocate();
+    _weights_reshaped.allocator()->allocate();
+    _v2mm_output.allocator()->allocate();
+}
+
+void CLDepthwiseConvolutionLayer::run()
+{
+    CLScheduler::get().enqueue(_im2col_kernel);
+
+    CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+    CLScheduler::get().enqueue(_v2mm_input_fill_border);
+    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+    CLScheduler::get().enqueue(_v2mm_kernel);
+
+    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+}
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
deleted file mode 100644
index 99be8cae4c..0000000000
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
-{
-}
-
-void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
-{
-    // Manage intermediate buffers
-    _memory_group.manage(&_sumsq);
-
-    // Configure kernels
-    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
-    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
-
-    // Allocate intermediate tensor
-    _sumsq.allocator()->allocate();
-}
-
-void CLL2Normalize::run()
-{
-    _memory_group.acquire();
-
-    _reduce_func.run();
-    CLScheduler::get().enqueue(_normalize_kernel, true);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
new file mode 100644
index 0000000000..d1bb65f1c9
--- /dev/null
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+{
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
+    // Configure kernels
+    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+    // Allocate intermediate tensor
+    _sumsq.allocator()->allocate();
+}
+
+void CLL2NormalizeLayer::run()
+{
+    _memory_group.acquire();
+
+    _reduce_func.run();
+    CLScheduler::get().enqueue(_normalize_kernel, true);
+
+    _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index a395487103..7e5278f380 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -29,7 +29,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "support/ToolchainSupport.h"
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp
deleted file mode 100755
index ed756cf261..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-GCDepthConcatenate::GCDepthConcatenate() //NOLINT
-    : _concat_kernels_vector(),
-      _border_handlers_vector(),
-      _num_inputs(0)
-{
-}
-
-void GCDepthConcatenate::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output) //NOLINT
-{
-    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
-    _num_inputs = inputs_vector.size();
-
-    unsigned int depth_offset = 0;
-
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<GCDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
-
-    for(unsigned int i = 0; i < _num_inputs; i++)
-    {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-
-        depth_offset += inputs_vector.at(i)->info()->dimension(2);
-    }
-}
-
-void GCDepthConcatenate::run()
-{
-    for(unsigned i = 0; i < _num_inputs; i++)
-    {
-        GCScheduler::get().enqueue(_border_handlers_vector[i], false);
-        GCScheduler::get().enqueue(_concat_kernels_vector[i], true);
-    }
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
new file mode 100755
index 0000000000..ee0b121695
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCDepthConcatenateLayer::GCDepthConcatenateLayer() //NOLINT
+    : _concat_kernels_vector(),
+      _border_handlers_vector(),
+      _num_inputs(0)
+{
+}
+
+void GCDepthConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output) //NOLINT
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+
+    unsigned int depth_offset = 0;
+
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+void GCDepthConcatenateLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        GCScheduler::get().enqueue(_border_handlers_vector[i], false);
+        GCScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
deleted file mode 100644
index f8ad2abe61..0000000000
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEDepthConcatenate::NEDepthConcatenate() // NOLINT
-    : _inputs_vector(),
-      _concat_kernels_vector(),
-      _border_handlers_vector(),
-      _num_inputs(0)
-{
-}
-
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
-{
-    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
-    _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
-
-    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
-
-    unsigned int depth_offset = 0;
-    for(unsigned int i = 0; i < _num_inputs; ++i)
-    {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
-
-        depth_offset += inputs_vector.at(i)->info()->dimension(2);
-    }
-}
-
-void NEDepthConcatenate::run()
-{
-    for(unsigned i = 0; i < _num_inputs; ++i)
-    {
-        NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
-        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
-    }
-}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
new file mode 100644
index 0000000000..437c9417ce
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT
+    : _inputs_vector(),
+      _concat_kernels_vector(),
+      _border_handlers_vector(),
+      _num_inputs(0)
+{
+}
+
+void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs             = inputs_vector.size();
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
+    unsigned int depth_offset = 0;
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+void NEDepthConcatenateLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; ++i)
+    {
+        NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
+        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+    }
+}
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
deleted file mode 100644
index 37857b6534..0000000000
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
-
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
new file mode 100644
index 0000000000..9a75404fcd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
+    k->configure(input, output, policy, shift);
+    _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp
deleted file mode 100644
index e12bc07464..0000000000
--- a/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolution.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEDepthwiseConvolution3x3::NEDepthwiseConvolution3x3()
-    : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
-{
-}
-
-void NEDepthwiseConvolution3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
-    // Call convolution kernel
-    _kernel.configure(input, weights, output, conv_info);
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
-    if(biases != nullptr)
-    {
-        _bias_kernel.configure(output, biases);
-        _has_bias = true;
-    }
-}
-
-void NEDepthwiseConvolution3x3::run()
-{
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-    NEScheduler::get().schedule(&_kernel, Window::DimX);
-    if(_has_bias)
-    {
-        NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
-    }
-}
-
-NEDepthwiseConvolution::NEDepthwiseConvolution()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
-{
-}
-
-void NEDepthwiseConvolution::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
-
-    const size_t weights_w = weights->info()->dimension(0);
-    const size_t weights_h = weights->info()->dimension(1);
-    const size_t weights_z = weights->info()->dimension(2);
-
-    bool has_bias = (biases != nullptr);
-
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
-
-    // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
-    const size_t conv_size  = conv_w * conv_h;
-
-    // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _input_reshaped.allocator()->init(info_im2col);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
-
-    // Weights reshape configuration
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
-    _weights_reshaped.allocator()->init(info_weights_reshape);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
-
-    // GEMV configuration
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _v2mm_output.allocator()->init(info_v2mm_out);
-    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
-
-    // Allocate intermediate tensors
-    _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
-    _v2mm_output.allocator()->allocate();
-}
-
-void NEDepthwiseConvolution::run()
-{
-    NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
-}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000000..b890c6f5d5
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
+    : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
+{
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+    // Call convolution kernel
+    _kernel.configure(input, weights, output, conv_info);
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    if(biases != nullptr)
+    {
+        _bias_kernel.configure(output, biases);
+        _has_bias = true;
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
+    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    if(_has_bias)
+    {
+        NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+    }
+}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
+{
+}
+
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+    const size_t weights_w = weights->info()->dimension(0);
+    const size_t weights_h = weights->info()->dimension(1);
+    const size_t weights_z = weights->info()->dimension(2);
+
+    bool has_bias = (biases != nullptr);
+
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+    // Set up intermediate tensors
+    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+    const size_t conv_size  = conv_w * conv_h;
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input_reshaped.allocator()->init(info_im2col);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+    // GEMV configuration
+    TensorShape shape_v2mm_out = input->info()->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _v2mm_output.allocator()->init(info_v2mm_out);
+    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+    // Allocate intermediate tensors
+    _input_reshaped.allocator()->allocate();
+    _weights_reshaped.allocator()->allocate();
+    _v2mm_output.allocator()->allocate();
+}
+
+void NEDepthwiseConvolutionLayer::run()
+{
+    NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp
deleted file mode 100644
index 349a781b0b..0000000000
--- a/src/runtime/NEON/functions/NEL2Normalize.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
-{
-}
-
-void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
-{
-    // Manage intermediate buffers
-    _memory_group.manage(&_sumsq);
-
-    // Configure Kernels
-    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
-    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
-
-    // Allocate intermediate tensors
-    _sumsq.allocator()->allocate();
-}
-
-void NEL2Normalize::run()
-{
-    _memory_group.acquire();
-
-    _reduce_func.run();
-    NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
new file mode 100644
index 0000000000..fa62483146
--- /dev/null
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
+    // Configure Kernels
+    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+    // Allocate intermediate tensors
+    _sumsq.allocator()->allocate();
+}
+
+void NEL2NormalizeLayer::run()
+{
+    _memory_group.acquire();
+
+    _reduce_func.run();
+    NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
+
+    _memory_group.release();
+}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index a680f1f11d..0e149d4176 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -28,7 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 #include "arm_compute/runtime/Tensor.h"
-- 
cgit v1.2.1