about summary refs log tree commit diff
path: root/src/core/CL/cl_kernels
diff options
context:
space:
mode:
author Anthony Barbier <anthony.barbier@arm.com> 2017-09-04 18:44:23 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-09-17 13:03:09 +0100
commit6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
treea7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /src/core/CL/cl_kernels
download ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
COMPMID-344 Updated doxygen
Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--src/core/CL/cl_kernels/absdiff.cl65
-rw-r--r--src/core/CL/cl_kernels/accumulate.cl130
-rw-r--r--src/core/CL/cl_kernels/activation_layer.cl89
-rw-r--r--src/core/CL/cl_kernels/arithmetic_op.cl122
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl99
-rw-r--r--src/core/CL/cl_kernels/bitwise_op.cl159
-rw-r--r--src/core/CL/cl_kernels/canny.cl429
-rw-r--r--src/core/CL/cl_kernels/channel_combine.cl416
-rw-r--r--src/core/CL/cl_kernels/channel_extract.cl272
-rw-r--r--src/core/CL/cl_kernels/color_convert.cl1823
-rw-r--r--src/core/CL/cl_kernels/concatenate.cl53
-rw-r--r--src/core/CL/cl_kernels/convolution3x3.cl138
-rw-r--r--src/core/CL/cl_kernels/convolution5x5.cl289
-rw-r--r--src/core/CL/cl_kernels/convolution7x7.cl340
-rw-r--r--src/core/CL/cl_kernels/convolution9x9.cl406
-rw-r--r--src/core/CL/cl_kernels/convolution_layer.cl241
-rw-r--r--src/core/CL/cl_kernels/convolution_rectangle.cl118
-rw-r--r--src/core/CL/cl_kernels/depth_convert.cl98
-rw-r--r--src/core/CL/cl_kernels/derivative.cl80
-rw-r--r--src/core/CL/cl_kernels/dilate.cl56
-rw-r--r--src/core/CL/cl_kernels/erode.cl56
-rw-r--r--src/core/CL/cl_kernels/fast_corners.cl260
-rw-r--r--src/core/CL/cl_kernels/fill_border.cl161
-rw-r--r--src/core/CL/cl_kernels/gaussian_pyramid.cl113
-rw-r--r--src/core/CL/cl_kernels/gemm.cl1099
-rw-r--r--src/core/CL/cl_kernels/harris_corners.cl376
-rw-r--r--src/core/CL/cl_kernels/helpers.h218
-rw-r--r--src/core/CL/cl_kernels/histogram.cl243
-rw-r--r--src/core/CL/cl_kernels/hog.cl455
-rw-r--r--src/core/CL/cl_kernels/integral_image.cl100
-rw-r--r--src/core/CL/cl_kernels/magnitude_phase.cl162
-rw-r--r--src/core/CL/cl_kernels/mean_stddev.cl84
-rw-r--r--src/core/CL/cl_kernels/minmaxloc.cl164
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter3x3.cl186
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter5x5.cl479
-rw-r--r--src/core/CL/cl_kernels/non_linear_filter_helpers.h145
-rw-r--r--src/core/CL/cl_kernels/nonmax.cl70
-rw-r--r--src/core/CL/cl_kernels/normalization_layer.cl154
-rw-r--r--src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl522
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_float.cl89
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_int.cl79
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl159
-rw-r--r--src/core/CL/cl_kernels/remap.cl132
-rw-r--r--src/core/CL/cl_kernels/scale.cl123
-rw-r--r--src/core/CL/cl_kernels/scharr_filter.cl124
-rw-r--r--src/core/CL/cl_kernels/sobel_filter.cl541
-rw-r--r--src/core/CL/cl_kernels/softmax_layer.cl221
-rw-r--r--src/core/CL/cl_kernels/tablelookup.cl114
-rw-r--r--src/core/CL/cl_kernels/threshold.cl104
-rw-r--r--src/core/CL/cl_kernels/transpose.cl217
-rw-r--r--src/core/CL/cl_kernels/types.h56
-rw-r--r--src/core/CL/cl_kernels/warp_affine.cl120
-rw-r--r--src/core/CL/cl_kernels/warp_helpers.h111
-rw-r--r--src/core/CL/cl_kernels/warp_perspective.cl128
54 files changed, 12788 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
new file mode 100644
index 0000000000..1761342eb4
--- /dev/null
+++ b/src/core/CL/cl_kernels/absdiff.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the absolute difference of two input images.
+ *
+ * Each work item processes 16 pixels.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] in1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] in1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] in2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] in2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void absdiff(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 pixels from each input, widened to the output data type
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Compute |in_a - in_b|, saturate to the output type and store 16 results
+ vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
new file mode 100644
index 0000000000..39c1512c3c
--- /dev/null
+++ b/src/core/CL/cl_kernels/accumulate.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function accumulates an input image into the output image.
+ *
+ * Each work item processes 16 pixels. The accumulator is read, updated with saturation and written back.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in,out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void accumulate(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu))
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ uchar16 in_data = vload16(0, input.ptr);
+ short16 accu_data = vload16(0, (__global short *)accu.ptr);
+
+ // Perform accumulation: accu = sat(accu + input)
+ short16 res = add_sat(convert_short16(in_data), accu_data);
+
+ // Store result back into the accumulator image
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
+
+/** This function accumulates a weighted value from an input image to an output image.
+ *
+ * Each work item processes 16 pixels and computes: accu = (1 - alpha) * accu + alpha * input.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in,out] accu_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] alpha The float scalar value with a value in the range of 0 to 1
+ */
+__kernel void accumulate_weighted(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const float alpha)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data as float for the weighted blend
+ const float16 in_data = convert_float16(vload16(0, input.ptr));
+ const float16 accu_data = convert_float16(vload16(0, accu.ptr));
+
+ // Calculate weighted accumulation and convert back to U8
+ const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
+
+ // Store result back into the accumulator image
+ vstore16(res, 0, accu.ptr);
+}
+
+/** This function accumulates a squared value from an input image to an output image.
+ *
+ * Each work item processes 16 pixels and computes: accu = sat(accu + ((input * input) >> shift)).
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in,out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The U32 scalar value with a value in the range of 0 to 15
+ */
+__kernel void accumulate_squared(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const uint shift)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data (input widened to ushort so input * input fits: 255 * 255 < 65536)
+ ushort16 in_data = convert_ushort16(vload16(0, input.ptr));
+ uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
+
+ // Calculate squared accumulation, saturating the result to S16
+ short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
+
+ // Store result back into the accumulator image
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
new file mode 100644
index 0000000000..e3cbb6c801
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This performs an activation function on floating point inputs.
+ *
+ * Each work item processes 16 elements; the activation is selected at compile time.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
+ * @note Distinction between floating point and integer is done using -DTYPE_FP and -DTYPE_INT preprocessor argument
+ * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation (exactly one of the following is selected at compile time)
+#if defined LOGISTIC
+ data = 1 / (1 + exp(-data));
+#elif defined TANH
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
+#elif defined RELU
+ data = max(0, data);
+#elif defined BRELU
+ data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max(0, data));
+#elif defined SRELU
+ data = log(1 + exp(data));
+#elif defined ABS
+#if defined TYPE_INT
+ data = abs(data);
+#else /* floating point input uses fabs */
+ data = fabs(data);
+#endif
+#elif defined SQUARE
+ data = data * data;
+#elif defined SQRT
+ data = sqrt(data);
+#elif defined LINEAR
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B;
+#endif
+
+ // Store result
+ vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
new file mode 100644
index 0000000000..434300efa8
--- /dev/null
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/* ADD/SUB use saturating arithmetic when -DSATURATE is passed at compile time, wrapping otherwise. */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** This function adds two images.
+ *
+ * Each work item processes 16 pixels.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_add(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values, widened to the output data type
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result (ADD saturates iff -DSATURATE was passed)
+ vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+
+/** This function subtracts one image from another.
+ *
+ * Each work item processes 16 pixels.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_sub(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values, widened to the output data type
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate in_a - in_b and store result (SUB saturates iff -DSATURATE was passed)
+ vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
new file mode 100644
index 0000000000..13e6702334
--- /dev/null
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply batch normalization.
+ *
+ * Each work item normalizes 4 consecutive elements of one slice:
+ * out = gamma * (in - mean) / sqrt(var + epsilon) + beta, with per-slice mean/var/beta/gamma.
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+ VECTOR_DECLARATION(beta),
+ VECTOR_DECLARATION(gamma),
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+ float4 _in = 0;
+ float4 denominator = 0;
+ float4 numerator = 0;
+ float4 x_bar = 0;
+ float4 gamma_vec = 0;
+ float4 beta_vec = 0;
+
+ // Per-slice parameters are indexed by the Z coordinate
+ const int current_slice = get_global_id(2);
+
+ _in = vload4(0, (__global float *)in.ptr);
+ denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
+ denominator = rsqrt(denominator + epsilon); // 1 / sqrt(var + epsilon)
+
+ // Calculate x bar and store results
+ numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = _in - numerator;
+ x_bar = numerator * denominator;
+
+ gamma_vec = *((__global float *)(gamma.ptr + current_slice * gamma.stride_x)); // fixed: was beta.stride_x, which is only correct when the two strides coincide
+ beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+
+ vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl
new file mode 100644
index 0000000000..135bfa989c
--- /dev/null
+++ b/src/core/CL/cl_kernels/bitwise_op.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Map the kernel arguments onto image structures
+ Image src_a = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image src_b = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 pixels from each source, OR them element-wise and store the result
+ const uchar16 lhs = vload16(0, src_a.ptr);
+ const uchar16 rhs = vload16(0, src_b.ptr);
+
+ vstore16(lhs | rhs, 0, dst.ptr);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Map the kernel arguments onto image structures
+ Image src_a = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image src_b = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 pixels from each source, AND them element-wise and store the result
+ const uchar16 lhs = vload16(0, src_a.ptr);
+ const uchar16 rhs = vload16(0, src_b.ptr);
+
+ vstore16(lhs & rhs, 0, dst.ptr);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Map the kernel arguments onto image structures
+ Image src_a = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image src_b = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 pixels from each source, XOR them element-wise and store the result
+ const uchar16 lhs = vload16(0, src_a.ptr);
+ const uchar16 rhs = vload16(0, src_b.ptr);
+
+ vstore16(lhs ^ rhs, 0, dst.ptr);
+}
+
+/** This function computes the bitwise NOT of an image.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ // Map the kernel arguments onto image structures
+ Image src = CONVERT_TO_IMAGE_STRUCT(in);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 pixels, invert every bit and store the result
+ const uchar16 pixels = vload16(0, src.ptr);
+
+ vstore16(~pixels, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
new file mode 100644
index 0000000000..ec6719213c
--- /dev/null
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the magnitude and phase from horizontal and vertical result of sobel result.
+ *
+ * @note The calculation of gradient uses level 1 normalisation.
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L1(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Set up image structures for the two Sobel inputs and the two outputs
+ Image gx_img = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image gy_img = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image mag_img = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image ang_img = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load four horizontal and four vertical Sobel responses
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ gx = vload4(0, (__global DATA_TYPE_IN *)gx_img.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ gy = vload4(0, (__global DATA_TYPE_IN *)gy_img.ptr);
+
+ // Gradient magnitude, level 1 norm: |gx| + |gy|, saturated to the output type
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
+ magnitude = CONVERT_SAT((abs(gx) + abs(gy)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
+
+ // Angle in units of pi via atan2pi, remapped from [-1, 1) to [0, 256)
+ float4 phase = atan2pi(convert_float4(gy), convert_float4(gx));
+ phase = select(phase, phase + 2, phase < 0.0f) * 128.0f;
+
+ // Store magnitude and the quantised angle
+ vstore4(magnitude, 0, (__global DATA_TYPE_OUT *)mag_img.ptr);
+ vstore4(convert_uchar4_sat_rte(phase), 0, ang_img.ptr);
+}
+
+/** Calculate the gradient and angle from horizontal and vertical result of sobel result.
+ *
+ * @note The calculation of gradient uses level 2 normalisation
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L2(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Set up image structures for the two Sobel inputs and the two outputs
+ Image gx_img = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image gy_img = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image mag_img = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image ang_img = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load four horizontal and four vertical Sobel responses as floats
+ float4 gx = convert_float4(vload4(0, (__global DATA_TYPE_IN *)gx_img.ptr));
+ float4 gy = convert_float4(vload4(0, (__global DATA_TYPE_IN *)gy_img.ptr));
+
+ // Gradient magnitude, level 2 norm: sqrt(gx^2 + gy^2)
+ float4 magnitude = sqrt(gx * gx + gy * gy);
+
+ // Angle in units of pi via atan2pi, remapped from [-1, 1) to [0, 256)
+ float4 phase = atan2pi(gy, gx);
+ phase = select(phase, phase + 2, phase < 0.0f) * 128.0f;
+
+ // Store the rounded, saturated magnitude and the quantised angle
+ vstore4(CONVERT_SAT_ROUND(magnitude, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)mag_img.ptr);
+ vstore4(convert_uchar4_sat_rte(phase), 0, ang_img.ptr);
+}
+
+/** Array that holds the relative coordinates offset for the neighbouring pixels.
+ *
+ * Indexed by the quantised direction (one entry per 45-degree step, labelled
+ * below in degrees). Each short4 packs two (x, y) pixel offsets as
+ * (x0, y0, x1, y1), i.e. .s02 are the x offsets and .s13 the y offsets.
+ */
+__constant short4 neighbours_coords[] =
+{
+ { -1, 0, 1, 0 }, // 0
+ { -1, 1, 1, -1 }, // 45
+ { 0, 1, 0, -1 }, // 90
+ { 1, 1, -1, -1 }, // 135
+ { 1, 0, -1, 0 }, // 180
+ { 1, -1, -1, 1 }, // 225
+ { 0, 1, 0, -1 }, // 270
+ { -1, -1, 1, 1 }, // 315
+ { -1, 0, 1, 0 }, // 360
+};
+
+/** Perform non maximum suppression.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] non_max_ptr Pointer to the non maximum suppressed output. Supported data types: U16, U32
+ * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] lower_thr The low threshold
+ */
+__kernel void suppress_non_maximum(
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle),
+ IMAGE_DECLARATION(non_max),
+ uint lower_thr)
+{
+ // Construct images
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+ Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
+
+ // Get gradient and angle (angle is stored as a U8 value in [0, 256))
+ DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
+ uchar an = convert_ushort(*angle.ptr);
+
+ // Early out: gradients at or below the low threshold cannot be edges
+ if(gradient <= lower_thr)
+ {
+ return;
+ }
+
+ // Divide the whole round into 8 directions: quantise the 8-bit angle to an
+ // index in [0, 8] with rounding ((ang + 16) >> 5), used into neighbours_coords
+ uchar ang = 127 - an;
+ DATA_TYPE_OUT q_an = (ang + 16) >> 5;
+
+ // Find the two pixels in the perpendicular direction
+ short2 x_p = neighbours_coords[q_an].s02;
+ short2 y_p = neighbours_coords[q_an].s13;
+ DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
+ DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
+
+ // Keep the pixel only if it is a strict local maximum along that direction;
+ // otherwise the output is left untouched (assumes non_max was zero-initialised
+ // by the host -- TODO confirm)
+ if((gradient > g1) && (gradient > g2))
+ {
+ *((global DATA_TYPE_OUT *)non_max.ptr) = gradient;
+ }
+}
+
+#define EDGE 255
+#define hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side
+#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
+
+/** Check whether pixel is valid
+*
+* Skip the pixel if the early_test fails.
+* Otherwise, it tries to add the pixel coordinate to the level 1 stack, and proceeds to
+* popping the stack instead if the stack is full.
+*
+* Relies on variables from the enclosing hysteresis kernel scope: c, l1_ptr,
+* l1_stack_counter, recorded, and the pop_stack label.
+*
+* @param[in] early_test Boolean condition based on the minv check and visited buffer check
+* @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] x_cur X-coordinate of current central pixel
+* @param[in] y_cur Y-coordinate of current central pixel
+*/
+#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \
+ { \
+ if(!early_test) \
+ { \
+ /* Number of elements in the local stack 1, points to next available entry */ \
+ c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \
+ \
+ if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \
+ goto pop_stack; \
+ \
+ /* The pixel that has already been recorded is ignored */ \
+ if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \
+ { \
+ l1_ptr[c] = (short2)(x_pos, y_pos); \
+ *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \
+ } \
+ } \
+ }
+
+/** Perform hysteresis.
+ *
+ * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
+ *
+ * @param[in] src_ptr Pointer to the input image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] out_ptr Pointer to the output image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
+ * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
+ * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32
+ * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8
+ * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] low_thr The lower threshold
+ * @param[in] up_thr The upper threshold
+ * @param[in] width The width of the image.
+ * @param[in] height The height of the image
+ */
+kernel void hysteresis(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(visited),
+ IMAGE_DECLARATION(recorded),
+ IMAGE_DECLARATION(l1_stack),
+ IMAGE_DECLARATION(l1_stack_counter),
+ uint low_thr,
+ uint up_thr,
+ int width,
+ int height)
+{
+ // Create images
+ Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
+ Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
+ Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
+ Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
+ Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
+
+ // Index
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ // Load value
+ DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
+
+ // If less than upper threshold set to NO_EDGE and return
+ if(val <= up_thr)
+ {
+ *offset(&out, x, y) = 0;
+ return;
+ }
+
+ // Init local stack 2
+ short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
+ int L2_counter = 0;
+
+ // Perform recursive hysteresis: (x, y) is re-assigned as positions are popped
+ // from the per-pixel L1 stack and the private L2 stack
+ while(true)
+ {
+ // Get L1 stack pointer: each pixel owns hysteresis_local_stack_L1 short2 slots
+ __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
+
+ // If the pixel has already been visited, proceed with the items in the stack instead
+ if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
+ {
+ goto pop_stack;
+ }
+
+ // Set strong edge
+ *offset(&out, x, y) = EDGE;
+
+ // If it is the top of stack l2, we don't need check the surrounding pixels
+ if(L2_counter > (hysteresis_local_stack_L2 - 1))
+ {
+ goto pop_stack2;
+ }
+
+ // Points to the start of the local stack;
+ char c;
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ x_tmp;
+ uint4 v_tmp;
+
+ // Get direction pixel indices
+ // NOTE(review): S/E are clamped to height - 2 / width - 2 -- confirm this matches
+ // the valid region established by the host-side border handling
+ int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
+
+ // Check 8 pixels around for weak edges where low_thr < val <= up_thr
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
+
+#undef check_pixel
+
+pop_stack:
+ // Pop one pending weak-edge position from this pixel's L1 stack, if any
+ c = *((__global char *)offset(&l1_stack_counter, x, y));
+
+ if(c >= 1)
+ {
+ *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
+ int2 l_c = convert_int2(l1_ptr[c - 1]);
+
+ // Push the current position into level 2 stack
+ stack_L2[L2_counter].x = x;
+ stack_L2[L2_counter].y = y;
+
+ x = l_c.x;
+ y = l_c.y;
+
+ L2_counter++;
+
+ continue;
+ }
+
+ // L1 stack exhausted: resume from the L2 stack, or finish if it is empty too
+ if(L2_counter > 0)
+ {
+ goto pop_stack2;
+ }
+ else
+ {
+ return;
+ }
+
+pop_stack2:
+ L2_counter--;
+ x = stack_L2[L2_counter].x;
+ y = stack_L2[L2_counter].y;
+ };
+}
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
new file mode 100644
index 0000000000..93e80b925e
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function combines three planes to a single RGB image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGB
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGB888(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Each work-item reads 16 channel values per plane and writes 48 interleaved
+    // bytes (16 R,G,B triplets). plane0/plane1/plane2 supply R/G/B respectively,
+    // as shown by the output ordering below.
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar16 data1 = vload16(0, plane1.ptr);
+    uchar16 data2 = vload16(0, plane2.ptr);
+
+    // Pixels 0-4 fully interleaved, plus the R value of pixel 5 (16 bytes)
+    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
+                             data0.s1, data1.s1, data2.s1,
+                             data0.s2, data1.s2, data2.s2,
+                             data0.s3, data1.s3, data2.s3,
+                             data0.s4, data1.s4, data2.s4,
+                             data0.s5);
+    vstore16(out0, 0, dst.ptr);
+
+    // G,B of pixel 5, pixels 6-9 fully, plus R,G of pixel 10 (16 bytes)
+    uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
+                             data1.s6, data2.s6, data0.s7,
+                             data1.s7, data2.s7, data0.s8,
+                             data1.s8, data2.s8, data0.s9,
+                             data1.s9, data2.s9, data0.sA,
+                             data1.sA);
+    vstore16(out1, 0, dst.ptr + 16);
+
+    // B of pixel 10, then pixels 11-15 fully interleaved (16 bytes)
+    uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
+                             data2.sB, data0.sC, data1.sC,
+                             data2.sC, data0.sD, data1.sD,
+                             data2.sD, data0.sE, data1.sE,
+                             data2.sE, data0.sF, data1.sF,
+                             data2.sF);
+    vstore16(out2, 0, dst.ptr + 32);
+}
+
+/** This function combines four planes to a single RGBA image.
+ *
+ * @param[in] plane0_ptr                            Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                       Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                         plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                       Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                         plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes  The offset of the first element in the first plane
+ * @param[in] plane1_ptr                            Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                       Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                         plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                       Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                         plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes  The offset of the first element in the second plane
+ * @param[in] plane2_ptr                            Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                       Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                         plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                       Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                         plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes  The offset of the first element in the third plane
+ * @param[in] plane3_ptr                            Pointer to the fourth plane. Supported Format: U8
+ * @param[in] plane3_stride_x                       Stride of the fourth plane in X dimension (in bytes)
+ * @param[in] plane3_step_x                         plane3_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane3_stride_y                       Stride of the fourth plane in Y dimension (in bytes)
+ * @param[in] plane3_step_y                         plane3_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane3_offset_first_element_in_bytes  The offset of the first element in the fourth plane
+ * @param[in] dst_ptr                               Pointer to the destination image. Supported Format: RGBA
+ * @param[in] dst_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGBA8888(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(plane3),
+    IMAGE_DECLARATION(dst))
+{
+    // Wrap the raw kernel arguments in image structures
+    Image in0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image in1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image in2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image in3 = CONVERT_TO_IMAGE_STRUCT(plane3);
+    Image out = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Each work-item consumes 16 values per channel and emits 64 interleaved bytes
+    const uchar16 c0 = vload16(0, in0.ptr);
+    const uchar16 c1 = vload16(0, in1.ptr);
+    const uchar16 c2 = vload16(0, in2.ptr);
+    const uchar16 c3 = vload16(0, in3.ptr);
+
+    // Pixels 0-3
+    vstore16((uchar16)(c0.s0, c1.s0, c2.s0, c3.s0,
+                       c0.s1, c1.s1, c2.s1, c3.s1,
+                       c0.s2, c1.s2, c2.s2, c3.s2,
+                       c0.s3, c1.s3, c2.s3, c3.s3),
+             0, out.ptr);
+
+    // Pixels 4-7
+    vstore16((uchar16)(c0.s4, c1.s4, c2.s4, c3.s4,
+                       c0.s5, c1.s5, c2.s5, c3.s5,
+                       c0.s6, c1.s6, c2.s6, c3.s6,
+                       c0.s7, c1.s7, c2.s7, c3.s7),
+             0, out.ptr + 16);
+
+    // Pixels 8-11
+    vstore16((uchar16)(c0.s8, c1.s8, c2.s8, c3.s8,
+                       c0.s9, c1.s9, c2.s9, c3.s9,
+                       c0.sA, c1.sA, c2.sA, c3.sA,
+                       c0.sB, c1.sB, c2.sB, c3.sB),
+             0, out.ptr + 32);
+
+    // Pixels 12-15
+    vstore16((uchar16)(c0.sC, c1.sC, c2.sC, c3.sC,
+                       c0.sD, c1.sD, c2.sD, c3.sD,
+                       c0.sE, c1.sE, c2.sE, c3.sE,
+                       c0.sF, c1.sF, c2.sF, c3.sF),
+             0, out.ptr + 48);
+}
+
+/** This function combines three planes to a single YUYV image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: YUYV
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_YUYV422(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Wrap the raw kernel arguments in image structures
+    Image in_y = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image in_u = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image in_v = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // 16 luma samples pair with 8 U and 8 V samples (4:2:2 horizontal subsampling)
+    const uchar16 luma = vload16(0, in_y.ptr);
+    const uchar8  cb   = vload8(0, in_u.ptr);
+    const uchar8  cr   = vload8(0, in_v.ptr);
+
+    // First 8 pixels, interleaved as Y,U,Y,V groups
+    const uchar16 lo = (uchar16)(luma.s0, cb.s0, luma.s1, cr.s0,
+                                 luma.s2, cb.s1, luma.s3, cr.s1,
+                                 luma.s4, cb.s2, luma.s5, cr.s2,
+                                 luma.s6, cb.s3, luma.s7, cr.s3);
+    vstore16(lo, 0, out.ptr);
+
+    // Remaining 8 pixels
+    const uchar16 hi = (uchar16)(luma.s8, cb.s4, luma.s9, cr.s4,
+                                 luma.sA, cb.s5, luma.sB, cr.s5,
+                                 luma.sC, cb.s6, luma.sD, cr.s6,
+                                 luma.sE, cb.s7, luma.sF, cr.s7);
+    vstore16(hi, 0, out.ptr + 16);
+}
+
+/** This function combines three planes to a single UYVY422 image.
+ *
+ * @param[in] plane0_ptr                            Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                       Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                         plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                       Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                         plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes  The offset of the first element in the first plane
+ * @param[in] plane1_ptr                            Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                       Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                         plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                       Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                         plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes  The offset of the first element in the second plane
+ * @param[in] plane2_ptr                            Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                       Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                         plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                       Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                         plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes  The offset of the first element in the third plane
+ * @param[in] dst_ptr                               Pointer to the destination image. Supported Format: UYVY
+ * @param[in] dst_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination image
+ */
+__kernel void channel_combine_UYVY422(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // 16 luma samples (plane0) pair with 8 U and 8 V samples (4:2:2 subsampling)
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar8 data1 = vload8(0, plane1.ptr);
+    uchar8 data2 = vload8(0, plane2.ptr);
+
+    // First 8 pixels, interleaved as U,Y,V,Y groups
+    uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
+                             data1.s1, data0.s2, data2.s1, data0.s3,
+                             data1.s2, data0.s4, data2.s2, data0.s5,
+                             data1.s3, data0.s6, data2.s3, data0.s7);
+    vstore16(out0, 0, dst.ptr);
+    // Remaining 8 pixels
+    uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
+                             data1.s5, data0.sA, data2.s5, data0.sB,
+                             data1.s6, data0.sC, data2.s6, data0.sD,
+                             data1.s7, data0.sE, data2.s7, data0.sF);
+    vstore16(out1, 0, dst.ptr + 16);
+}
+
+/** This function combines three planes to a single NV12/NV21 image.
+ *
+ * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void channel_combine_NV(
+    IMAGE_DECLARATION(src_plane0),
+    IMAGE_DECLARATION(src_plane1),
+    IMAGE_DECLARATION(src_plane2),
+    IMAGE_DECLARATION(dst_plane0),
+    IMAGE_DECLARATION(dst_plane1),
+    uint height)
+{
+    // Get pixels pointer
+    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+
+    // Copy plane data: the current Y row plus the row 'height' (sub-sampled height)
+    // rows below it, since the UV plane is vertically sub-sampled.
+    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+
+    // Create UV plane
+    uchar8 data1 = vload8(0, src_plane1.ptr);
+    uchar8 data2 = vload8(0, src_plane2.ptr);
+
+    // Interleave the chroma samples pairwise (U0 V0 U1 V1 ... for NV12, V first for
+    // NV21). In shuffle2, mask indices 0-7 select the first uchar8 argument and
+    // 8-15 the second, so (0, 8, 1, 9, ...) alternates the two planes.
+    // Fix: the previous mask (0, 2, 4, ..., 1, 3, ...) grouped all samples of one
+    // plane together instead of interleaving, producing a corrupt UV plane.
+#if defined NV12
+    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#elif defined NV21
+    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#endif
+}
+
+/** This function combines three planes to a single YUV444 or IYUV image.
+ *
+ * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void copy_planes_3p(
+    IMAGE_DECLARATION(src_plane0),
+    IMAGE_DECLARATION(src_plane1),
+    IMAGE_DECLARATION(src_plane2),
+    IMAGE_DECLARATION(dst_plane0),
+    IMAGE_DECLARATION(dst_plane1),
+    IMAGE_DECLARATION(dst_plane2),
+    uint height)
+{
+    // Get pixels pointer
+    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+    Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
+
+    // Copy plane data (16 Y values per work-item)
+    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+#if defined YUV444
+    // Full-resolution chroma: copy 16 values from each chroma plane
+    vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
+    vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#elif defined IYUV
+    // NOTE(review): also copies the Y row 'height' (sub-sampled height) rows below
+    // the current one -- each work-item appears to handle two Y rows because the
+    // chroma planes are vertically sub-sampled; confirm against the dispatcher.
+    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+    // Half-width chroma: copy 8 values from each chroma plane
+    vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
+    vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#endif
+}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
new file mode 100644
index 0000000000..14c6c8a92a
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function extracts a given channel from an RGB image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ * @note Each work-item reads 24 bytes (8 RGB pixels) and writes 8 bytes of the selected channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGB
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGB888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // 8 RGB pixels = 24 bytes: load as one 16-byte and one 8-byte vector
+ uchar16 data = vload16(0, src.ptr);
+ uchar8 data2 = vload8(0, src.ptr + 16);
+
+ // The selected channel repeats every 3 bytes; the last samples straddle
+ // the boundary between the two loads, hence the split selectors below
+#if defined CHANNEL_R
+ vstore4(data.s0369, 0, dst.ptr);
+ vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
+#elif defined CHANNEL_G
+ vstore4(data.s147A, 0, dst.ptr);
+ vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
+#elif defined CHANNEL_B
+ vstore4(data.s258B, 0, dst.ptr);
+ vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
+#endif
+}
+
+/** This function extracts a given channel from an RGBA image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGBA8888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Resolve per-work-item pixel pointers
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Read 8 RGBA pixels (4 bytes each) as two 16-byte vectors
+ uchar16 pix_lo = vload16(0, src.ptr);
+ uchar16 pix_hi = vload16(0, src.ptr + 16);
+
+ // The requested channel sits at a fixed offset inside every 4-byte pixel,
+ // so a single strided swizzle from each half yields the 8 output bytes
+#if defined CHANNEL_R
+ vstore8((uchar8)(pix_lo.s048C, pix_hi.s048C), 0, dst.ptr);
+#elif defined CHANNEL_G
+ vstore8((uchar8)(pix_lo.s159D, pix_hi.s159D), 0, dst.ptr);
+#elif defined CHANNEL_B
+ vstore8((uchar8)(pix_lo.s26AE, pix_hi.s26AE), 0, dst.ptr);
+#elif defined CHANNEL_A
+ vstore8((uchar8)(pix_lo.s37BF, pix_hi.s37BF), 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from a YUYV image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_YUYV422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // 16 bytes = 8 pixels in YUYV order: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_Y
+ // Y occupies every even byte: 8 luma samples
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_U
+ // U occupies byte 1 of every 4-byte pair: 4 chroma samples
+ vstore4(data.s159D, 0, dst.ptr);
+#elif defined CHANNEL_V
+ // V occupies byte 3 of every 4-byte pair: 4 chroma samples
+ vstore4(data.s37BF, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from a UYVY image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: UYVY
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_UYVY422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // 16 bytes = 8 pixels in UYVY order: U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_Y
+ // Y occupies every odd byte: 8 luma samples
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_U
+ // U occupies byte 0 of every 4-byte pair: 4 chroma samples
+ vstore4(data.s048C, 0, dst.ptr);
+#elif defined CHANNEL_V
+ // V occupies byte 2 of every 4-byte pair: 4 chroma samples
+ vstore4(data.s26AE, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV12 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only channels UV can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV12(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Interleaved NV12 chroma plane: U V U V ... (U first)
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_U
+ // U samples are at the even bytes
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_V
+ // V samples are at the odd bytes
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV21 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only channels UV can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV21(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Interleaved NV21 chroma plane: V U V U ... (V first - swapped vs NV12)
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_U
+ // U samples are at the odd bytes
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_V
+ // V samples are at the even bytes
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given plane from an multi-planar image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void copy_plane(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Resolve the work-item's position in the source and destination planes
+ Image in_plane = CONVERT_TO_IMAGE_STRUCT(src);
+ Image out_plane = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Straight 16-byte copy per work-item
+ uchar16 block = vload16(0, in_plane.ptr);
+ vstore16(block, 0, out_plane.ptr);
+}
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
new file mode 100644
index 0000000000..f5ec85ae76
--- /dev/null
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -0,0 +1,1823 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Convert an RGB888 image to RGBX8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 16 pixels every time: 48 input bytes (RGB) -> 64 output bytes (RGBA)
+ uchar16 rgb_0 = vload16(0, in.ptr);
+ uchar16 rgb_1 = vload16(0, in.ptr + 16);
+ uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+ // Insert an opaque alpha byte (255) after every RGB triplet. Triplets
+ // straddle the 16-byte load boundaries, hence the split selectors.
+ // Note: hex vector components need the 's' prefix (.sf, not .f).
+ uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
+ uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+ uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
+ uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+ vstore16(rgba_2, 0, out.ptr + 32);
+ vstore16(rgba_3, 0, out.ptr + 48);
+}
+
+/** Convert an RGBX8888 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGBA8888_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+ // handle 16 pixels every time: 64 input bytes (RGBA) -> 48 output bytes (RGB)
+ uchar16 rgba_0 = vload16(0, in.ptr);
+ uchar16 rgba_1 = vload16(0, in.ptr + 16);
+ uchar16 rgba_2 = vload16(0, in.ptr + 32);
+ uchar16 rgba_3 = vload16(0, in.ptr + 48);
+
+ // Drop the alpha byte (every 4th byte: indices 3, 7, b, f of each load)
+ uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
+ uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
+ uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore16(rgb_1, 0, out.ptr + 16);
+ vstore16(rgb_2, 0, out.ptr + 32);
+}
+
+/** Convert a UYVY422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time; UYVY byte order is U0 Y0 V0 Y1 ...
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ // Y at odd bytes; each U/V sample is shared by a pair of pixels.
+ // Centring chroma on 128 relies on char's modulo-256 conversion behaviour.
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+ // BT.709 full-range YCbCr -> RGB matrix
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ // Saturate: the matrix can produce values outside [0, 255], and an
+ // unsaturated out-of-range float->uchar conversion is undefined in OpenCL
+ uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+ // Interleave back to packed RGB: 8 pixels = 24 bytes (16 + 8 store)
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a UYVY422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time; UYVY byte order is U0 Y0 V0 Y1 ...
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ // Y at odd bytes; each U/V sample is shared by a pair of pixels
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+ // BT.709 full-range YCbCr -> RGB matrix
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ // Saturate: the matrix can produce values outside [0, 255], and an
+ // unsaturated out-of-range float->uchar conversion is undefined in OpenCL
+ uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+ // Interleave with an opaque alpha byte: 8 pixels = 32 bytes
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time; YUYV byte order is Y0 U0 Y1 V0 ...
+ uchar16 yuyv = vload16(0, in.ptr);
+
+ // Y at even bytes; each U/V sample is shared by a pair of pixels
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ char8 cb = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
+ char8 cr = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
+
+ // BT.709 full-range YCbCr -> RGB matrix
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ // Saturate: the matrix can produce values outside [0, 255], and an
+ // unsaturated out-of-range float->uchar conversion is undefined in OpenCL
+ uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+ // Interleave back to packed RGB: 8 pixels = 24 bytes (16 + 8 store)
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // handle 8 pixels every time; YUYV byte order is Y0 U0 Y1 V0 ...
+ uchar16 yuyv = vload16(0, in.ptr);
+
+ // Y at even bytes; each U/V sample is shared by a pair of pixels
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ char8 cb = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
+ char8 cr = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
+
+ // BT.709 full-range YCbCr -> RGB matrix
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ // Saturate: the matrix can produce values outside [0, 255], and an
+ // unsaturated out-of-range float->uchar conversion is undefined in OpenCL
+ uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+ // Interleave with an opaque alpha byte: 8 pixels = 32 bytes
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a RGB image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGB888_to_NV12_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
+    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // Each work-item converts a 2x2 pixel patch: full-resolution luma is written per row,
+    // while the four chroma samples are averaged into a single interleaved CbCr pair.
+    int sum_cb = 0;
+    int sum_cr = 0;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        // De-interleave the two RGB pixels of this row (6 bytes of the 8-byte load are used).
+        uchar8 px = vload8(0, src.ptr + row * input_stride_y);
+        float2 r  = convert_float2((uchar2)(px.s0, px.s3));
+        float2 g  = convert_float2((uchar2)(px.s1, px.s4));
+        float2 b  = convert_float2((uchar2)(px.s2, px.s5));
+
+        // BT.709 RGB -> YCbCr; chroma is biased by +128 into the unsigned range below.
+        float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * r + (float2)(0.7152f) * g + (float2)(0.0722f) * b;
+        float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * r - (float2)(0.3854f) * g + (float2)(0.5000f) * b;
+        float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * r - (float2)(0.4542f) * g - (float2)(0.0458f) * b;
+
+        short2 y_q = convert_short2_rtz(f_y);
+        short2 u_q = convert_short2_rtz(f_u) + (short2)(128);
+        short2 v_q = convert_short2_rtz(f_v) + (short2)(128);
+
+        vstore2(convert_uchar2(clamp(y_q, (short2)(0), (short2)(255))), 0, dst_y.ptr + row * luma_stride_y);
+
+        uchar2 cb = convert_uchar2(clamp(u_q, (short2)(0), (short2)(255)));
+        uchar2 cr = convert_uchar2(clamp(v_q, (short2)(0), (short2)(255)));
+        sum_cb += cb.s0 + cb.s1;
+        sum_cr += cr.s0 + cr.s1;
+    }
+
+    // 2x2 average of the clamped chroma, stored interleaved (Cb then Cr).
+    vstore2((uchar2)(sum_cb / 4, sum_cr / 4), 0, dst_uv.ptr);
+}
+
+/*
+ R'= Y' + 0.0000*U + 1.5748*V
+ G'= Y' - 0.1873*U - 0.4681*V
+ B'= Y' + 1.8556*U + 0.0000*V
+*/
+
+/** Convert an NV12 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // Handle 8 pixels per work-item: two rows of 4 luma samples sharing one row of CbCr pairs.
+    uchar4 luma_0 = vload4(0, in_luma.ptr);
+    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+    uchar4 cbcr   = vload4(0, in_uv.ptr);
+    // Each CbCr pair is shared by two horizontally adjacent pixels; re-centre around zero.
+    char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    // BT.709 chroma contributions; identical for both rows, so computed once.
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    // Saturating conversion: f_r/f_g/f_b can fall outside [0, 255] (e.g. high luma combined with
+    // strong chroma, or negative intermediate results). A plain convert_uchar4_rtz on such
+    // out-of-range values is undefined in OpenCL; the _sat variant clamps to [0, 255] instead.
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    // Re-interleave 4 RGB triplets (12 bytes) as one 8-byte and one 4-byte store.
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+    // Second row reuses the same chroma contributions.
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert a RGB image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_YUV444_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // One work-item converts 4 interleaved RGB pixels into 4 planar Y, U and V samples.
+    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // De-interleave 4 RGB triplets (12 bytes of the 16-byte load are used).
+    uchar16 px = vload16(0, src.ptr);
+    float4  r  = convert_float4((uchar4)(px.s0, px.s3, px.s6, px.s9));
+    float4  g  = convert_float4((uchar4)(px.s1, px.s4, px.s7, px.sa));
+    float4  b  = convert_float4((uchar4)(px.s2, px.s5, px.s8, px.sb));
+
+    // BT.709 RGB -> YUV full-range coefficients.
+    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * r + (float4)(0.7152f) * g + (float4)(0.0722f) * b;
+    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * r - (float4)(0.3854f) * g + (float4)(0.5000f) * b;
+    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * r - (float4)(0.4542f) * g - (float4)(0.0458f) * b;
+
+    // Round toward zero, bias chroma into the unsigned range, then clamp to 8 bits.
+    short4 y_q = convert_short4_rtz(f_y);
+    short4 u_q = convert_short4_rtz(f_u) + (short4)(128);
+    short4 v_q = convert_short4_rtz(f_v) + (short4)(128);
+
+    vstore4(convert_uchar4(clamp(y_q, (short4)(0), (short4)(255))), 0, dst_y.ptr);
+    vstore4(convert_uchar4(clamp(u_q, (short4)(0), (short4)(255))), 0, dst_u.ptr);
+    vstore4(convert_uchar4(clamp(v_q, (short4)(0), (short4)(255))), 0, dst_v.ptr);
+}
+
+/** Convert a RGB image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_IYUV_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Each work-item converts a 2x2 pixel patch: full-resolution luma is written per row,
+    // while the four chroma samples are averaged into one U and one V sample (IYUV is planar).
+    int sum_cb = 0;
+    int sum_cr = 0;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        // De-interleave the two RGB pixels of this row (6 bytes of the 8-byte load are used).
+        uchar8 px = vload8(0, src.ptr + row * rgb_input_stride_y);
+        float2 r  = convert_float2((uchar2)(px.s0, px.s3));
+        float2 g  = convert_float2((uchar2)(px.s1, px.s4));
+        float2 b  = convert_float2((uchar2)(px.s2, px.s5));
+
+        // BT.709 RGB -> YCbCr; chroma is biased by +128 into the unsigned range below.
+        float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * r + (float2)(0.7152f) * g + (float2)(0.0722f) * b;
+        float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * r - (float2)(0.3854f) * g + (float2)(0.5000f) * b;
+        float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * r - (float2)(0.4542f) * g - (float2)(0.0458f) * b;
+
+        short2 y_q = convert_short2_rtz(f_y);
+        short2 u_q = convert_short2_rtz(f_u) + (short2)(128);
+        short2 v_q = convert_short2_rtz(f_v) + (short2)(128);
+
+        vstore2(convert_uchar2(clamp(y_q, (short2)(0), (short2)(255))), 0, dst_y.ptr + row * luma_output_stride_y);
+
+        uchar2 cb = convert_uchar2(clamp(u_q, (short2)(0), (short2)(255)));
+        uchar2 cr = convert_uchar2(clamp(v_q, (short2)(0), (short2)(255)));
+        sum_cb += cb.s0 + cb.s1;
+        sum_cr += cr.s0 + cr.s1;
+    }
+
+    // 2x2 average of the clamped chroma, written to the separate U and V planes.
+    *dst_u.ptr = (uchar)(sum_cb / 4);
+    *dst_v.ptr = (uchar)(sum_cr / 4);
+}
+
+/** Convert a RGBA image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_YUV444_bt709(
+    IMAGE_DECLARATION(rgba_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time
+    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 4 pixels; alpha (s3, s7, sb, sf) is ignored.
+    uchar16 rgb_0 = vload16(0, in_rgba.ptr);
+    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
+    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
+    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
+
+    // BT.709 RGB -> YUV full-range coefficients.
+    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+    // Explicit _rtz rounding for consistency with the sibling conversion kernels
+    // (the default float->int rounding mode is round-toward-zero, so behavior is unchanged).
+    short4 i_y = convert_short4_rtz(f_y);
+    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
+    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
+
+    // Values are already clamped to [0, 255], so the plain (non-saturating) conversion
+    // is sufficient — matching the other *_to_YUV444 / *_to_NV12 kernels in this file.
+    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
+    vstore4(luma_0, 0, out_y.ptr);
+
+    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
+    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
+    vstore4(cb_0, 0, out_u.ptr);
+    vstore4(cr_0, 0, out_v.ptr);
+}
+
+/** Convert a RGBA image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGBA8888_to_NV12_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
+    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Each work-item converts a 2x2 pixel patch: full-resolution luma is written per row,
+    // while the four chroma samples are averaged into a single interleaved CbCr pair.
+    int sum_cb = 0;
+    int sum_cr = 0;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        // De-interleave the two RGBA pixels of this row; alpha (s3, s7) is ignored.
+        uchar8 px = vload8(0, src.ptr + row * input_stride_y);
+        float2 r  = convert_float2((uchar2)(px.s0, px.s4));
+        float2 g  = convert_float2((uchar2)(px.s1, px.s5));
+        float2 b  = convert_float2((uchar2)(px.s2, px.s6));
+
+        // BT.709 RGB -> YCbCr; chroma is biased by +128 into the unsigned range below.
+        float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * r + (float2)(0.7152f) * g + (float2)(0.0722f) * b;
+        float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * r - (float2)(0.3854f) * g + (float2)(0.5000f) * b;
+        float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * r - (float2)(0.4542f) * g - (float2)(0.0458f) * b;
+
+        short2 y_q = convert_short2_rtz(f_y);
+        short2 u_q = convert_short2_rtz(f_u) + (short2)(128);
+        short2 v_q = convert_short2_rtz(f_v) + (short2)(128);
+
+        vstore2(convert_uchar2(clamp(y_q, (short2)(0), (short2)(255))), 0, dst_y.ptr + row * luma_output_stride_y);
+
+        uchar2 cb = convert_uchar2(clamp(u_q, (short2)(0), (short2)(255)));
+        uchar2 cr = convert_uchar2(clamp(v_q, (short2)(0), (short2)(255)));
+        sum_cb += cb.s0 + cb.s1;
+        sum_cr += cr.s0 + cr.s1;
+    }
+
+    // 2x2 average of the clamped chroma, stored interleaved (Cb then Cr).
+    vstore2((uchar2)(sum_cb / 4, sum_cr / 4), 0, dst_uv.ptr);
+}
+
+/** Convert a RGBA image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_IYUV_bt709(
+    IMAGE_DECLARATION(rgba_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Each work-item converts a 2x2 pixel patch: full-resolution luma is written per row,
+    // while the four chroma samples are averaged into one U and one V sample (IYUV is planar).
+    int sum_cb = 0;
+    int sum_cr = 0;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        // De-interleave the two RGBA pixels of this row; alpha (s3, s7) is ignored.
+        uchar8 px = vload8(0, src.ptr + row * rgba_input_stride_y);
+        float2 r  = convert_float2((uchar2)(px.s0, px.s4));
+        float2 g  = convert_float2((uchar2)(px.s1, px.s5));
+        float2 b  = convert_float2((uchar2)(px.s2, px.s6));
+
+        // BT.709 RGB -> YCbCr; chroma is biased by +128 into the unsigned range below.
+        float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * r + (float2)(0.7152f) * g + (float2)(0.0722f) * b;
+        float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * r - (float2)(0.3854f) * g + (float2)(0.5000f) * b;
+        float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * r - (float2)(0.4542f) * g - (float2)(0.0458f) * b;
+
+        short2 y_q = convert_short2_rtz(f_y);
+        short2 u_q = convert_short2_rtz(f_u) + (short2)(128);
+        short2 v_q = convert_short2_rtz(f_v) + (short2)(128);
+
+        vstore2(convert_uchar2(clamp(y_q, (short2)(0), (short2)(255))), 0, dst_y.ptr + row * luma_output_stride_y);
+
+        uchar2 cb = convert_uchar2(clamp(u_q, (short2)(0), (short2)(255)));
+        uchar2 cr = convert_uchar2(clamp(v_q, (short2)(0), (short2)(255)));
+        sum_cb += cb.s0 + cb.s1;
+        sum_cr += cr.s0 + cr.s1;
+    }
+
+    // 2x2 average of the clamped chroma, written to the separate U and V planes.
+    *dst_u.ptr = (uchar)(sum_cb / 4);
+    *dst_v.ptr = (uchar)(sum_cr / 4);
+}
+
+/** Convert an NV12 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // Each work-item converts a 2x4 pixel tile: two rows of 4 luma samples
+ // share a single row of 2 interleaved UV pairs (4:2:0 subsampling).
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ // Duplicate each chroma sample across two pixels and re-centre around
+ // zero. The uchar->char conversion wraps modulo 256, so
+ // (char)v - (char)128 equals v - 128 for every v in [0, 255].
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ // BT.709 chroma contributions; they depend only on cb/cr, so they are
+ // computed once and reused for both luma rows.
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ // NOTE(review): convert_uchar4_rtz wraps values outside [0, 255] rather
+ // than clamping; convert_uchar4_sat_rtz would saturate instead - confirm
+ // the wrap-around behaviour is intended for out-of-gamut results.
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ // Pack 4 RGBA pixels per row (alpha forced to 255) as two uchar8 stores.
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ // Second luma row reuses the same chroma deltas (vertical 2:1 chroma).
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV12 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Pure plane reshuffle (no colour math): forward two 16-pixel luma rows
+ // unchanged and de-interleave the shared UV row into separate U and V
+ // planes. Covers a 16x2 tile per work-item.
+ uchar16 row_top = vload16(0, src_y.ptr);
+ uchar16 row_bot = vload16(0, src_y.ptr + luma_input_stride_y);
+ uchar16 uv = vload16(0, src_uv.ptr);
+
+ vstore16(row_top, 0, dst_y.ptr);
+ vstore16(row_bot, 0, dst_y.ptr + luma_output_stride_y);
+ vstore8(uv.even, 0, dst_u.ptr); // NV12: U occupies the even bytes
+ vstore8(uv.odd, 0, dst_v.ptr); // ...and V the odd bytes
+}
+
+/** Convert an NV12 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Copy two 16-pixel luma rows and upsample the shared UV row to full
+ // 4:4:4 resolution: each chroma sample is duplicated horizontally via a
+ // repeat-swizzle and vertically by writing the same row twice.
+ uchar16 row_top = vload16(0, src_y.ptr);
+ uchar16 row_bot = vload16(0, src_y.ptr + luma_input_stride_y);
+ uchar16 uv = vload16(0, src_uv.ptr);
+ uchar16 u_full = uv.s0022446688aaccee; // NV12: U in even bytes, doubled
+ uchar16 v_full = uv.s1133557799bbddff; // NV12: V in odd bytes, doubled
+
+ vstore16(row_top, 0, dst_y.ptr);
+ vstore16(row_bot, 0, dst_y.ptr + luma_output_stride_y);
+ vstore16(u_full, 0, dst_u.ptr);
+ vstore16(u_full, 0, dst_u.ptr + u_output_stride_y);
+ vstore16(v_full, 0, dst_v.ptr);
+ vstore16(v_full, 0, dst_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // handle 8 pixels every time, two lines, each line for 4 pixels
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ // NV21 interleaves V first, then U (opposite of NV12). Each chroma
+ // sample is spread over two pixels and re-centred around zero; the
+ // uchar->char conversion wraps modulo 256, so (char)v - (char)128
+ // yields v - 128 for every v in [0, 255].
+ char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ // BT.709 chroma contributions, computed once and shared by both rows.
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ // NOTE(review): convert_uchar4_rtz wraps values outside [0, 255] rather
+ // than saturating (convert_uchar4_sat_rtz would clamp) - confirm intended.
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ // Pack 4 RGB888 pixels (12 bytes) as one uchar8 plus one uchar4 store:
+ // rgb_0 holds pixels 0-1 and the R/G of pixel 2; rgb_1 the remainder.
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ // Second luma row reuses the same chroma deltas (vertical 2:1 chroma).
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgba_output))
+{
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image src_vu = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+ // Each work-item converts a 2x4 pixel tile: two luma rows share one
+ // interleaved VU row (4:2:0 subsampling).
+ uchar4 y_top = vload4(0, src_y.ptr);
+ uchar4 y_bot = vload4(0, src_y.ptr + luma_input_stride_y);
+ uchar4 vu = vload4(0, src_vu.ptr);
+ // NV21 stores V before U. Spread each chroma pair over two pixels and
+ // shift to a zero-centred range (uchar->char wraps modulo 256, so the
+ // subtraction is exact for all inputs).
+ char4 v_c = (char4)(vu.s0, vu.s0, vu.s2, vu.s2) - (char4)(128);
+ char4 u_c = (char4)(vu.s1, vu.s1, vu.s3, vu.s3) - (char4)(128);
+
+ // BT.709 chroma terms, computed once and added to both luma rows.
+ float4 d_red = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(u_c) + (float4)(1.5748f) * convert_float4(v_c);
+ float4 d_grn = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(u_c) - (float4)(0.4681f) * convert_float4(v_c);
+ float4 d_blu = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(u_c) + (float4)(0.0000f) * convert_float4(v_c);
+
+ // Top row: truncate to 8 bit and pack 4 RGBA pixels (alpha = 255).
+ uchar4 red = convert_uchar4_rtz(convert_float4(y_top) + d_red);
+ uchar4 grn = convert_uchar4_rtz(convert_float4(y_top) + d_grn);
+ uchar4 blu = convert_uchar4_rtz(convert_float4(y_top) + d_blu);
+
+ vstore8((uchar8)(red.s0, grn.s0, blu.s0, 255, red.s1, grn.s1, blu.s1, 255), 0, dst.ptr);
+ vstore8((uchar8)(red.s2, grn.s2, blu.s2, 255, red.s3, grn.s3, blu.s3, 255), 0, dst.ptr + 8);
+
+ // Bottom row: same chroma deltas, second luma row.
+ red = convert_uchar4_rtz(convert_float4(y_bot) + d_red);
+ grn = convert_uchar4_rtz(convert_float4(y_bot) + d_grn);
+ blu = convert_uchar4_rtz(convert_float4(y_bot) + d_blu);
+
+ vstore8((uchar8)(red.s0, grn.s0, blu.s0, 255, red.s1, grn.s1, blu.s1, 255), 0, dst.ptr + rgba_output_stride_y);
+ vstore8((uchar8)(red.s2, grn.s2, blu.s2, 255, red.s3, grn.s3, blu.s3, 255), 0, dst.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image src_vu = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Copy two 16-pixel luma rows and upsample the shared VU row to full
+ // 4:4:4 resolution: duplicate each chroma sample horizontally via a
+ // repeat-swizzle and vertically by writing the same row twice.
+ uchar16 row_top = vload16(0, src_y.ptr);
+ uchar16 row_bot = vload16(0, src_y.ptr + luma_input_stride_y);
+ uchar16 vu = vload16(0, src_vu.ptr);
+ uchar16 v_full = vu.s0022446688aaccee; // NV21: V in even bytes, doubled
+ uchar16 u_full = vu.s1133557799bbddff; // NV21: U in odd bytes, doubled
+
+ vstore16(row_top, 0, dst_y.ptr);
+ vstore16(row_bot, 0, dst_y.ptr + luma_output_stride_y);
+ vstore16(u_full, 0, dst_u.ptr);
+ vstore16(u_full, 0, dst_u.ptr + u_output_stride_y);
+ vstore16(v_full, 0, dst_v.ptr);
+ vstore16(v_full, 0, dst_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image src_vu = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Pure plane reshuffle (no colour math): forward two 16-pixel luma rows
+ // unchanged and de-interleave the shared VU row into U and V planes.
+ uchar16 row_top = vload16(0, src_y.ptr);
+ uchar16 row_bot = vload16(0, src_y.ptr + luma_input_stride_y);
+ uchar16 vu = vload16(0, src_vu.ptr);
+
+ vstore16(row_top, 0, dst_y.ptr);
+ vstore16(row_bot, 0, dst_y.ptr + luma_output_stride_y);
+ vstore8(vu.odd, 0, dst_u.ptr); // NV21: U occupies the odd bytes
+ vstore8(vu.even, 0, dst_v.ptr); // ...and V the even bytes
+}
+
+/** Convert a UYVY image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void UYVY422_to_IYUV_bt709(
+ IMAGE_DECLARATION(uyvy_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
+ Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Two UYVY rows of 8 pixels each: luma passes straight through, while
+ // the chroma of both rows is averaged down to one half-resolution row.
+ uchar16 row = vload16(0, src.ptr);
+ vstore8(row.odd, 0, dst_y.ptr); // UYVY: Y lives in the odd bytes
+ ushort4 u_top = convert_ushort4(row.s048c); // U samples of row 0
+ ushort4 v_top = convert_ushort4(row.s26ae); // V samples of row 0
+
+ row = vload16(0, src.ptr + uyvy_input_stride_y);
+ vstore8(row.odd, 0, dst_y.ptr + luma_output_stride_y);
+ ushort4 u_bot = convert_ushort4(row.s048c);
+ ushort4 v_bot = convert_ushort4(row.s26ae);
+
+ // Vertical 2:1 chroma decimation: truncating average of the two rows.
+ vstore4(convert_uchar4((u_top + u_bot) / (ushort4)(2)), 0, dst_u.ptr);
+ vstore4(convert_uchar4((v_top + v_bot) / (ushort4)(2)), 0, dst_v.ptr);
+}
+
+/** Convert a YUYV image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void YUYV422_to_IYUV_bt709(
+    IMAGE_DECLARATION(yuyv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // One work-item consumes two YUYV rows of 8 pixels each (16 bytes per row).
+    const uchar16 yuyv_t = vload16(0, in_yuyv.ptr);
+    const uchar16 yuyv_b = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+
+    // Luma occupies the even bytes of a YUYV stream; emit one 8-pixel luma row per input row.
+    vstore8(yuyv_t.s02468ace, 0, out_y.ptr);
+    vstore8(yuyv_b.s02468ace, 0, out_y.ptr + luma_output_stride_y);
+
+    // Chroma sits at the odd bytes: Cb at 1,5,9,d and Cr at 3,7,b,f. IYUV is
+    // subsampled vertically as well, so each chroma sample is averaged across
+    // the two rows. hadd() computes (a + b) >> 1 without intermediate overflow,
+    // matching the truncating widened (a + b) / 2 of the reference version.
+    vstore4(hadd(yuyv_t.s159d, yuyv_b.s159d), 0, out_u.ptr);
+    vstore4(hadd(yuyv_t.s37bf, yuyv_b.s37bf), 0, out_v.ptr);
+}
+
+/** Convert an IYUV image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // One work-item covers a 4x2 pixel patch; both rows share the same
+    // 2x2-subsampled chroma samples.
+    const uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+    // Duplicate each chroma sample over its two pixels and re-centre around zero.
+    const char4 cb = (char4)(cbcr.s0011) - (char4)(128);
+    const char4 cr = (char4)(cbcr.s2233) - (char4)(128);
+
+    // BT709 chroma contributions per channel; identical for both rows so they
+    // are hoisted out of the row loop.
+    const float4 r_off = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    const float4 g_off = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    const float4 b_off = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    __global uchar *y_in    = in_y.ptr;
+    __global uchar *rgb_out = out_rgb.ptr;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        const float4 luma = convert_float4(vload4(0, y_in));
+
+        // Truncate-towards-zero conversion matches the reference rounding mode.
+        // NOTE(review): no saturation is applied, as in the original — values
+        // outside [0, 255] wrap; confirm inputs are in the expected range.
+        const uchar4 r = convert_uchar4_rtz(luma + r_off);
+        const uchar4 g = convert_uchar4_rtz(luma + g_off);
+        const uchar4 b = convert_uchar4_rtz(luma + b_off);
+
+        // Interleave 4 RGB pixels (12 bytes) as one 8-byte and one 4-byte store.
+        vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, rgb_out);
+        vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, rgb_out + 8);
+
+        y_in    += luma_input_stride_y;
+        rgb_out += rgb_output_stride_y;
+    }
+}
+
+/** Convert an IYUV image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(rgba_output))
+{
+    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+    // One work-item covers a 4x2 pixel patch; both rows share the same
+    // 2x2-subsampled chroma samples.
+    const uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+    // Duplicate each chroma sample over its two pixels and re-centre around zero.
+    const char4 cb = (char4)(cbcr.s0011) - (char4)(128);
+    const char4 cr = (char4)(cbcr.s2233) - (char4)(128);
+
+    // BT709 chroma contributions per channel; identical for both rows so they
+    // are hoisted out of the row loop.
+    const float4 r_off = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    const float4 g_off = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    const float4 b_off = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    __global uchar *y_in     = in_y.ptr;
+    __global uchar *rgba_out = out_rgb.ptr;
+
+    for(int row = 0; row < 2; ++row)
+    {
+        const float4 luma = convert_float4(vload4(0, y_in));
+
+        // Truncate-towards-zero conversion matches the reference rounding mode.
+        // NOTE(review): no saturation is applied, as in the original — values
+        // outside [0, 255] wrap; confirm inputs are in the expected range.
+        const uchar4 r = convert_uchar4_rtz(luma + r_off);
+        const uchar4 g = convert_uchar4_rtz(luma + g_off);
+        const uchar4 b = convert_uchar4_rtz(luma + b_off);
+
+        // Interleave 4 RGBA pixels (16 bytes) with a fully-opaque alpha channel.
+        vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, rgba_out);
+        vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, rgba_out + 8);
+
+        y_in     += luma_input_stride_y;
+        rgba_out += rgba_output_stride_y;
+    }
+}
+
+/** Convert an IYUV image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void IYUV_to_YUV444_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // One work-item covers a 16x2 pixel patch. Luma is copied verbatim.
+    const uchar16 luma_t = vload16(0, in_y.ptr);
+    const uchar16 luma_b = vload16(0, in_y.ptr + luma_input_stride_y);
+    vstore16(luma_t, 0, out_y.ptr);
+    vstore16(luma_b, 0, out_y.ptr + luma_output_stride_y);
+
+    // Upsample chroma horizontally by duplicating every sample (8 -> 16 values).
+    const uchar8  cb_half = vload8(0, in_u.ptr);
+    const uchar8  cr_half = vload8(0, in_v.ptr);
+    const uchar16 cb      = cb_half.s0011223344556677;
+    const uchar16 cr      = cr_half.s0011223344556677;
+
+    // Upsample vertically too: the single IYUV chroma row serves both output lines.
+    vstore16(cb, 0, out_u.ptr);
+    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+    vstore16(cr, 0, out_v.ptr);
+    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an IYUV image to NV12
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void IYUV_to_NV12_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // One work-item covers a 16x2 pixel patch. Luma is copied verbatim.
+    const uchar16 luma_t = vload16(0, in_y.ptr);
+    const uchar16 luma_b = vload16(0, in_y.ptr + luma_input_stride_y);
+    vstore16(luma_t, 0, out_y.ptr);
+    vstore16(luma_b, 0, out_y.ptr + luma_output_stride_y);
+
+    // Both formats are 2x2 subsampled, so the chroma planes are only
+    // re-interleaved: Cb goes to the even bytes, Cr to the odd bytes.
+    uchar16 cbcr;
+    cbcr.s02468ace = vload8(0, in_u.ptr);
+    cbcr.s13579bdf = vload8(0, in_v.ptr);
+    vstore16(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a YUYV image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void YUYV422_to_NV12_bt709(
+    IMAGE_DECLARATION(yuyv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv  = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // One work-item consumes two YUYV rows of 8 pixels each (16 bytes per row).
+    const uchar16 yuyv_t = vload16(0, in_yuyv.ptr);
+    const uchar16 yuyv_b = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+
+    // Luma occupies the even bytes of a YUYV stream.
+    vstore8(yuyv_t.s02468ace, 0, out_y.ptr);
+    vstore8(yuyv_b.s02468ace, 0, out_y.ptr + luma_output_stride_y);
+
+    // The odd bytes are already interleaved CbCr; NV12 is 2x2 subsampled, so
+    // average the two rows. hadd() computes (a + b) >> 1 without intermediate
+    // overflow, matching the truncating widened (a + b) / 2 of the original.
+    vstore8(hadd(yuyv_t.s13579bdf, yuyv_b.s13579bdf), 0, out_uv.ptr);
+}
+
+/** Convert a UYVY image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void UYVY422_to_NV12_bt709(
+    IMAGE_DECLARATION(input_uyvy),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image in     = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // One work-item consumes two UYVY rows of 8 pixels each (16 bytes per row).
+    const uchar16 row_t = vload16(0, in.ptr);
+    const uchar16 row_b = vload16(0, in.ptr + input_uyvy_stride_y);
+
+    // Luma occupies the odd bytes of a UYVY stream.
+    vstore8(row_t.s13579bdf, 0, out_y.ptr);
+    vstore8(row_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
+
+    // The even bytes are already interleaved CbCr; NV12 is 2x2 subsampled, so
+    // average the two rows. hadd() computes (a + b) >> 1 without intermediate
+    // overflow, matching the truncating widened (a + b) / 2 of the original.
+    vstore8(hadd(row_t.s02468ace, row_b.s02468ace), 0, out_uv.ptr);
+}
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000000..00f5189508
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    unsigned int offset)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Copy four F32 elements per work-item, shifting the destination by the
+    // byte offset of this input inside the concatenated output tensor.
+    const float4 data = vload4(0, (__global float *)src.ptr);
+    vstore4(data, 0, (__global float *)(dst.ptr + offset));
+}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
new file mode 100644
index 0000000000..3733d0c733
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+ const short left_coeff,
+ const short middle_coeff,
+ const short right_coeff)
+{
+ // Load 16 bytes once so the three overlapping 8-pixel windows (offsets 0, 1, 2)
+ // can be extracted with swizzles instead of three separate loads
+ uchar16 temp = vload16(0, left_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ // Element-wise weighted sum: out[i] = left[i]*lc + middle[i]*mc + right[i]*rc
+ return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+ Image *src,
+ const short mat0, const short mat1, const short mat2,
+ const short mat3, const short mat4, const short mat5,
+ const short mat6, const short mat7, const short mat8, uint scale)
+{
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ // Accumulate one 1x3 horizontal convolution per matrix row, centred on (x, y)
+ // Row 0
+ pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+ // Row 1
+ pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+ // Row 2
+ pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+ // Divide by the scale
+ // NOTE(review): unlike the 5x5/7x7 variants there is no scale > 0 guard here,
+ // so the caller must supply a non-zero scale — confirm with call sites
+ return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute 8 output pixels using the compile-time matrix MAT0..MAT8 and SCALE
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution3x3(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+ // Saturating convert to the output data type and store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
new file mode 100644
index 0000000000..d1335c5558
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff)
+{
+ // Single 16-byte load; the five overlapping 8-pixel windows (offsets 0..4)
+ // are extracted via swizzles
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ // Element-wise weighted sum of the five shifted windows
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+ + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ // Accumulate rows y-2 .. y+2, each weighted by its coefficient.
+ // Intermediate math is done in COMPUTE_TYPE to avoid overflow of DATA_TYPE.
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ return out;
+}
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3 , mat4 ]\n
+ * [ mat5, mat6, mat7, mat8, mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ // One 1x5 horizontal convolution per matrix row, rows y-2 .. y+2
+ pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+ pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+ pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+ pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+ pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+ // Apply the scale only when it is non-zero (scale == 0 leaves the sum untouched)
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ // Saturating narrow to short8 for the caller to store
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1, -DMAT1=2, -DMAT2=3, -DMAT3=4, -DMAT4=5, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels: horizontal pass of the separable filter, no scaling here
+ // (the scale is applied by the 5x1 vertical pass)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels: vertical pass of the separable filter over the 1x5 intermediate
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+ // Divide by the scale (SCALE is a compile-time constant; must be non-zero)
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Saturating convert to the output data type and store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution5x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute 8 output pixels with the compile-time matrix MAT0..MAT24 and SCALE
+ short8 pixels = convolution5x5(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
+
+ // Saturating convert to the output data type and store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
new file mode 100644
index 0000000000..74a0055370
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x7(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff)
+{
+ // Single 16-byte load; the seven overlapping 8-pixel windows (offsets 0..6)
+ // are extracted via swizzles
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ // Element-wise weighted sum of the seven shifted windows
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the third down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution7x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ // Accumulate rows y-3 .. y+3, each weighted by its coefficient.
+ // Intermediate math is done in COMPUTE_TYPE to avoid overflow of DATA_TYPE.
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ return out;
+}
+
+/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6 ]\n
+ * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n
+ * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
+ * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
+ * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
+ * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
+ * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution7x7(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ // One 1x7 horizontal convolution per matrix row, rows y-3 .. y+3
+ pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
+ pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
+ pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
+ pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
+ pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
+ pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
+ pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
+
+ // Apply the scale only when it is non-zero (scale == 0 leaves the sum untouched)
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ // Saturating narrow to short8 for the caller to store
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT7=7 -DMAT8=8, ... -DMAT13=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable7x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
+
+ // Divide by the scale
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution7x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution7x7(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
+
+ // Clamp results to [ 0, 255 ] and store them in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
new file mode 100644
index 0000000000..d8b07cafac
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the third left pixel
+ * @param[in] left4_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the third right pixel
+ * @param[in] right4_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x9(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short left4_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff,
+ const short right4_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the third up pixel
+ * @param[in] up4_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the third down pixel
+ * @param[in] down4_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution9x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short up4_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff,
+ const short down4_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
+
+ return out;
+}
+
+/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6, mat7, mat8 ]\n
+ * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
+ * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
+ * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
+ * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
+ * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
+ * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n
+ * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n
+ * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] mat49 Coefficient from the convolution matrix
+ * @param[in] mat50 Coefficient from the convolution matrix
+ * @param[in] mat51 Coefficient from the convolution matrix
+ * @param[in] mat52 Coefficient from the convolution matrix
+ * @param[in] mat53 Coefficient from the convolution matrix
+ * @param[in] mat54 Coefficient from the convolution matrix
+ * @param[in] mat55 Coefficient from the convolution matrix
+ * @param[in] mat56 Coefficient from the convolution matrix
+ * @param[in] mat57 Coefficient from the convolution matrix
+ * @param[in] mat58 Coefficient from the convolution matrix
+ * @param[in] mat59 Coefficient from the convolution matrix
+ * @param[in] mat60 Coefficient from the convolution matrix
+ * @param[in] mat61 Coefficient from the convolution matrix
+ * @param[in] mat62 Coefficient from the convolution matrix
+ * @param[in] mat63 Coefficient from the convolution matrix
+ * @param[in] mat64 Coefficient from the convolution matrix
+ * @param[in] mat65 Coefficient from the convolution matrix
+ * @param[in] mat66 Coefficient from the convolution matrix
+ * @param[in] mat67 Coefficient from the convolution matrix
+ * @param[in] mat68 Coefficient from the convolution matrix
+ * @param[in] mat69 Coefficient from the convolution matrix
+ * @param[in] mat70 Coefficient from the convolution matrix
+ * @param[in] mat71 Coefficient from the convolution matrix
+ * @param[in] mat72 Coefficient from the convolution matrix
+ * @param[in] mat73 Coefficient from the convolution matrix
+ * @param[in] mat74 Coefficient from the convolution matrix
+ * @param[in] mat75 Coefficient from the convolution matrix
+ * @param[in] mat76 Coefficient from the convolution matrix
+ * @param[in] mat77 Coefficient from the convolution matrix
+ * @param[in] mat78 Coefficient from the convolution matrix
+ * @param[in] mat79 Coefficient from the convolution matrix
+ * @param[in] mat80 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution9x9(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
+ const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
+ const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
+ const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
+ const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
+ const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
+ const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
+ const short mat80, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
+ pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
+ pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
+ pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
+ pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
+ pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
+ pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
+ pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
+ pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT8=8, -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable9x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
+
+ // Divide by the scale
+ pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution9x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution9x9(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
+ MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
+ MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
+ MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
new file mode 100644
index 0000000000..bd5dfaff68
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel reshapes the tensor's low three dimensions to single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Same as input
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ */
+__kernel void reshape_to_columns(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+#if defined HAS_BIAS
+    VECTOR_DECLARATION(bias),
+#endif
+    uint width, uint height, uint depth, uint total_filters)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    // Only the very last work-item of the 3D NDRange appends the bias values.
+    const bool append_bias = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+    // Linearize this work-item's (x, y, z) weight coordinate into a destination row.
+    const uint dst_row = get_global_id(0) + get_global_id(1) * width + get_global_id(2) * width * height;
+
+    __global uchar *in_ptr  = src.ptr;
+    __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + dst_row * dst_stride_y;
+#if defined HAS_BIAS
+    __global uchar *b_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif
+
+    // Walk along the 4th dimension (one element copied per filter); consecutive
+    // filters are depth*src_stride_z apart in the source and dst_stride_x apart
+    // in the destination.
+    for(uint i = 0; i < total_filters; ++i)
+    {
+        *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)in_ptr);
+
+#if defined HAS_BIAS
+        // The bias lives one destination row below the last weight element.
+        if(append_bias)
+        {
+            *((__global DATA_TYPE *)(out_ptr + dst_stride_y)) = *((__global DATA_TYPE *)b_ptr);
+            b_ptr += bias_stride_x;
+        }
+#endif
+        in_ptr += depth * src_stride_z;
+        out_ptr += dst_stride_x;
+    }
+}
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] kernel_size The convolution kernel size
+ * @param[in] kernel_depth The kernel depth
+ * @param[in] width The output tensor width
+ * @param[in] input_dims The input tensor dimensions
+ * @param[in] strides The strides of the im2col operation
+ * @param[in] paddings The input tensor paddings
+ */
+__kernel void im2col_generic(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    int kernel_size,
+    int kernel_depth,
+    int width,
+    int2 input_dims,
+    int2 strides,
+    int2 paddings)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst);
+
+    // Each work-item fills one row of the output matrix.
+    __global uchar *out = dst.ptr + (get_global_id(1) * width + get_global_id(0)) * dst.stride_y;
+
+    // Top-left corner of the receptive field; may be negative because of padding.
+    const int start_x = get_global_id(0) * strides.x - paddings.x;
+    const int start_y = get_global_id(1) * strides.y - paddings.y;
+
+    // Linearize the kernel_size x kernel_size x kernel_depth patch into the row.
+    for(int d = 0; d < kernel_depth; ++d)
+    {
+        for(int y = start_y; y < start_y + kernel_size; ++y)
+        {
+            for(int x = start_x; x < start_x + kernel_size; ++x, out += dst.stride_x)
+            {
+                const bool inside = (x >= 0) && (x < input_dims.x) && (y >= 0) && (y < input_dims.y);
+                // Padding region reads as zero.
+                *((__global DATA_TYPE *)out) = inside ? *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d))) : (DATA_TYPE)0;
+            }
+        }
+    }
+
+#if defined HAS_BIAS
+    // Append a trailing 1 so the GEMM folds the bias into the multiplication.
+    *((__global DATA_TYPE *)out) = 1;
+#endif
+}
+
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The output tensor width
+ */
+__kernel void col2im(
+    IMAGE_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint width)
+{
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+
+    // Map the 2D GEMM output coordinate back onto the 3D tensor: global id 0
+    // selects the plane (z) while global id 1 is split into row/column by 'width'.
+    const int x_off = (get_global_id(1) % width) * dst.stride_x;
+    const int y_off = (get_global_id(1) / width) * dst.stride_y;
+    const int z_off = get_global_id(0) * dst.stride_z;
+
+    *((__global DATA_TYPE *)(dst.ptr + x_off + y_off + z_off)) = *((__global DATA_TYPE *)src.ptr);
+}
+
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input.
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+__kernel void im2col_reduced(
+    TENSOR3D_DECLARATION(src),
+    VECTOR_DECLARATION(dst),
+    uint width, uint height)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    // Flatten the (x, y, z) work-item coordinate into a linear vector index.
+    const uint linear_idx = get_global_id(0) + get_global_id(1) * width + get_global_id(2) * width * height;
+
+    __global uchar *out = dst_ptr + dst_offset_first_element_in_bytes + linear_idx * dst_stride_x;
+
+    *((__global DATA_TYPE *)out) = *((__global DATA_TYPE *)src.ptr);
+
+#if defined HAS_BIAS
+    // The last work-item of the 3D NDRange appends a 1 used to fold in the bias.
+    if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+    {
+        *((__global DATA_TYPE *)(out + dst_stride_x)) = (DATA_TYPE)1;
+    }
+#endif
+}
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
new file mode 100644
index 0000000000..96b9cff3eb
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "convolution3x3.cl"
+#include "convolution5x5.cl"
+#include "convolution7x7.cl"
+#include "convolution9x9.cl"
+#include "helpers.h"
+
+#define MAT_INDEX(i) MAT##i
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_rectangle(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compile-time coefficients gathered into an indexable array so the row loop
+    // below can address them as matrix_coeff[col + row * MATRIX_WIDTH]. Entries past
+    // MATRIX_WIDTH * MATRIX_HEIGHT are never read for matrices smaller than 9x9.
+    short matrix_coeff[81] =
+    {
+        MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
+        MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
+        MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
+        MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
+        MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
+        MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
+        MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
+        MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
+        MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
+    };
+
+    // Accumulator for 8 horizontally-consecutive output pixels.
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
+
+    // Accumulate one 1xMATRIX_WIDTH convolution per matrix row. Rows are centred on
+    // the current pixel: the vertical offset runs from -(MATRIX_HEIGHT / 2) upward,
+    // and the horizontal offset of each row load is -(MATRIX_WIDTH / 2).
+    for(int i = 0; i < MATRIX_HEIGHT; i++)
+    {
+#if MATRIX_WIDTH == 3
+        pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
+                                 matrix_coeff[2 + i * 3]);
+#endif
+
+#if MATRIX_WIDTH == 5
+        pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
+                                 matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
+#endif
+
+#if MATRIX_WIDTH == 7
+        pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
+                                 matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
+                                 matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
+#endif
+
+#if MATRIX_WIDTH == 9
+        pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
+                                 matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
+                                 matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
+#endif
+    }
+
+    // Normalize by the compile-time scale factor.
+    pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
new file mode 100644
index 0000000000..c8eaa95352
--- /dev/null
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
+#else
+#define CONVERT_DOWN(x, type) CONVERT(x, type)
+#endif
+
+/** This function performs a down-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_down(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int shift)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    // Read 16 input elements, scale them down by the requested shift, then
+    // narrow them to the output type (saturating when SATURATE is defined).
+    VEC_DATA_TYPE(DATA_TYPE_IN, 16)
+    loaded = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+
+    vstore16(CONVERT_DOWN(loaded >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+
+/** This function performs a up-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_up(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int shift)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    // Widen 16 input elements to the output type first, then scale up by the shift
+    // so the left shift happens in the wider type.
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    widened = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+    vstore16(widened << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
new file mode 100644
index 0000000000..0e810d2e7c
--- /dev/null
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel that computes the first-order derivative.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void derivative(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+    // Horizontal central difference: right neighbour minus left neighbour.
+    short16 left  = convert_short16(vload16(0, offset(&src, -1, 0)));
+    short16 right = convert_short16(vload16(0, offset(&src, 1, 0)));
+    vstore16(right - left, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+    // Vertical central difference: row below minus row above.
+    short16 above = convert_short16(vload16(0, offset(&src, 0, -1)));
+    short16 below = convert_short16(vload16(0, offset(&src, 0, 1)));
+    vstore16(below - above, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
new file mode 100644
index 0000000000..c62c701757
--- /dev/null
+++ b/src/core/CL/cl_kernels/dilate.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function dilates an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void dilate(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load three 16-wide rows, starting one pixel up and to the left.
+    uchar16 row_above = vload16(0, offset(&src, -1, -1));
+    uchar16 row_mid   = vload16(0, offset(&src, -1, 0));
+    uchar16 row_below = vload16(0, offset(&src, -1, 1));
+
+    // Vertical maximum first, then the horizontal maximum of each 3-wide window
+    // via shifted views of the same vector; yields 8 dilated output pixels.
+    uchar16 vert_max = max(max(row_above, row_mid), row_below);
+    uchar8  result   = max(max(vert_max.s01234567, vert_max.s12345678), vert_max.s23456789);
+
+    vstore8(result, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
new file mode 100644
index 0000000000..6576f1827f
--- /dev/null
+++ b/src/core/CL/cl_kernels/erode.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function erodes an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void erode(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load three adjacent rows of 16 pixels starting one pixel up and left of the
+    // current position, giving the full 3x3 neighbourhood for 8 output pixels.
+    uchar16 top = vload16(0, offset(&src, -1, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+    // Vertical min across the three rows, then horizontal min over each 3-wide
+    // window (left/centre/right shifted slices) -> 3x3 erosion of 8 pixels.
+    uchar16 tmp = min(top, min(middle, bottom));
+    uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
+
+    vstore8(out, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
new file mode 100644
index 0000000000..470d14a7b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
+ *
+ * . . F 0 1 . . .
+ * . E . . . 2 . .
+ * D . . . . . 3 .
+ * C . . P . . 4 .
+ * B . . . . . 5 .
+ * . A . . . 6 . .
+ * . . 9 8 7 . . .
+ */
+// Each entry is the (x, y) offset of one Bresenham-circle texel relative to the
+// centre pixel P, in the clockwise order shown in the diagram above.
+constant int offsets_s[16][2] =
+{
+    { 0, -3 },  // 0
+    { 1, -3 },  // 1
+    { 2, -2 },  // 2
+    { 3, -1 },  // 3
+    { 3, 0 },   // 4
+    { 3, 1 },   // 5
+    { 2, 2 },   // 6
+    { 1, 3 },   // 7
+    { 0, 3 },   // 8
+    { -1, 3 },  // 9
+    { -2, 2 },  // A
+    { -3, 1 },  // B
+    { -3, 0 },  // C
+    { -3, -1 }, // D
+    { -2, -2 }, // E
+    { -1, -3 }, // F
+};
+
+/** Load a pixel and set the mask values.
+ *
+ * @param[in] ptr The pointer to the starting address of source image
+ * @param[in] a Index to indicate the position in the Bresenham circle
+ * @param[in] stride Stride of source image in x dimension
+ * @param[in] dark The left end of the threshold range
+ * @param[in] bright The right end of the threshold range
+ * @param[out] dark_mask The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark
+ * @param[out] bright_mask The bit-set mask records bright pixels. Its bit is set as 1 if the corresponding pixel is bright
+ *
+ */
+// NOTE(review): deliberately a bare { } block rather than do { } while(0) - every
+// call site in this file invokes the macro WITHOUT a trailing semicolon.
+// Arguments are expanded more than once ('a', 'ptr', 'stride'); pass only
+// side-effect-free expressions.
+#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
+    { \
+        unsigned char pixel; \
+        pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \
+        dark_mask |= (pixel < dark) << a; \
+        bright_mask |= (pixel > bright) << a; \
+    }
+
+/** Checks if a pixel is a corner. Pixel is considered as a corner if 9 contiguous pixels in the Bresenham circle are bright or dark.
+ *
+ * @param[in] bright_mask The mask recording positions of bright pixels
+ * @param[in] dark_mask The mask recording positions of dark pixels
+ * @param[out] isCorner Indicate whether candidate pixel is corner
+ */
+// NOTE(review): callers first duplicate the low 16 circle bits into the high 16
+// bits of each mask, so the 16 single-bit right shifts below examine every
+// rotation of the circle for a run of 9 consecutive set bits (0x1FF).
+// The masks are consumed (shifted to pieces); callers do not reuse them after.
+#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \
+    { \
+        for(int i = 0; i < 16; i++) \
+        { \
+            isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
+            isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \
+            if(isCorner) \
+            { \
+                break; \
+            } \
+            bright_mask >>= 1; \
+            dark_mask >>= 1; \
+        } \
+    }
+
+/* Calculate pixel's strength */
+uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
+{
+    // Binary search over [threshold, 255] for the largest threshold at which the
+    // candidate pixel is still classified as a corner; that value is returned as
+    // the corner's strength. 'a' always holds a threshold known to keep the pixel
+    // a corner, 'b' one known (or assumed) to lose it.
+    short a = threshold;
+    short b = 255;
+    while(b - a > 1)
+    {
+        uchar c = convert_uchar_sat((a + b) / 2);
+        unsigned int bright_mask = 0;
+        unsigned int dark_mask = 0;
+
+        // Re-derive the bright/dark bands around the candidate with the midpoint
+        // threshold (saturating to stay inside [0, 255]).
+        unsigned char p_bright = add_sat(candidate_pixel, c);
+        unsigned char p_dark = sub_sat(candidate_pixel, c);
+
+        bool isCorner = 0;
+
+        // Classify all 16 Bresenham-circle texels against the new bands.
+        for(uint i = 0; i < 16; i++)
+        {
+            LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
+        }
+
+        // Duplicate the 16 circle bits into the high half so CHECK_CORNER can
+        // test all rotations for 9 consecutive bright or dark pixels.
+        bright_mask |= (bright_mask << 16);
+        dark_mask |= (dark_mask << 16);
+        CHECK_CORNER(bright_mask, dark_mask, isCorner);
+
+        if(isCorner)
+        {
+            a = convert_short(c); // still a corner: strength is at least c
+        }
+        else
+        {
+            b = convert_short(c); // no longer a corner: strength is below c
+        }
+    }
+    return a;
+}
+
+/** Fast corners implementation. Calculates and returns the strength of each pixel.
+ *
+ * The algorithm loops through the 16 pixels in the Bresenham circle and sets the low 16 bits of the masks if the corresponding pixel
+ * is bright or dark. It then copies the low 16 bits to the high 16 bits of the masks and right-shifts to check whether 9 contiguous
+ * bits from the LSB are set.
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] output_ptr                             Pointer to the destination image. Supported data types: U8
+ * @param[in]  output_stride_x                        Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                          output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                        Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                          output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes   The offset of the first element in the destination image
+ * @param[in] threshold_value Threshold value.
+ *
+ */
+__kernel void fast_corners(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output),
+    float threshold_value)
+{
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    const unsigned char threshold = (uchar)threshold_value;
+
+    unsigned int bright_mask = 0;
+    unsigned int dark_mask = 0;
+
+    unsigned char isCorner = 0;
+
+    // Candidate pixel and its saturated bright/dark bands.
+    unsigned char p = *in.ptr;
+    unsigned char p_bright = add_sat(p, threshold);
+    unsigned char p_dark = sub_sat(p, threshold);
+
+    // Probe the four "compass" texels (positions 0, 4, 8, 12) first.
+    LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+    // Early out: any run of 9 consecutive circle texels must contain at least one
+    // of positions 0, 4, 8 or 12 (bits of 0x1111). If none of them is bright or
+    // dark, the pixel cannot be a corner.
+    if(((bright_mask | dark_mask) & 0x1111) == 0)
+    {
+        *out.ptr = 0;
+        return;
+    }
+
+    // Classify the remaining 12 texels of the Bresenham circle.
+    LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+    // Duplicate the 16 circle bits into the high half so CHECK_CORNER can test
+    // every rotation for a 9-bit run.
+    bright_mask |= (bright_mask << 16);
+    dark_mask |= (dark_mask << 16);
+
+    CHECK_CORNER(bright_mask, dark_mask, isCorner)
+
+    if(!isCorner)
+    {
+        *out.ptr = 0;
+        return;
+    }
+
+#ifndef USE_MAXSUPPRESSION
+    // Without non-maxima suppression any non-zero strength suffices.
+    *out.ptr = 1;
+#else
+    // With non-maxima suppression, compute the actual strength so neighbouring
+    // responses can be compared later.
+    *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
+#endif
+}
+
+/** Copy result to Keypoint buffer and count number of corners
+ *
+ * @param[in] input_ptr Pointer to the image with calculated strengths. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] max_num_points The maximum number of keypoints the array can hold
+ * @param[out] offset The number of skipped pixels in x dimension
+ * @param[out] num_of_points Number of points found
+ * @param[out] out The keypoints found
+ *
+ */
+__kernel void copy_to_keypoint(
+    IMAGE_DECLARATION(input),
+    uint max_num_points,
+    uint offset,
+    __global uint *num_of_points,
+    __global Keypoint *out)
+{
+#ifndef UPDATE_NUMBER
+    // When the total is not being tracked past capacity, bail out as soon as the
+    // keypoint array is full.
+    if(*num_of_points >= max_num_points)
+    {
+        return;
+    }
+#endif
+
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+
+    uchar value = *in.ptr;
+
+    // A non-zero strength marks a detected corner.
+    if(value > 0)
+    {
+        // Atomically reserve a slot index; the counter keeps incrementing even
+        // when the array is full so the host can know how many corners exist.
+        int id = atomic_inc(num_of_points);
+        // NOTE(review): signed/unsigned comparison - 'id' is promoted to uint.
+        // Safe while the counter stays within int range; confirm for huge images.
+        if(id < max_num_points)
+        {
+            out[id].strength = value;
+            // 'offset' compensates for the border skipped by the detector; it is
+            // applied to both coordinates here - presumably the border is
+            // symmetric in x and y (TODO confirm against the caller).
+            out[id].x = get_global_id(0) + offset;
+            out[id].y = get_global_id(1) + offset;
+            out[id].tracking_status = 1;
+        }
+    }
+}
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
new file mode 100644
index 0000000000..df635869b1
--- /dev/null
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+    IMAGE_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos)
+{
+    Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    // One work-item handles either one image row (left+right borders) or one
+    // image column (top+bottom borders): gid0 in [0, total_width) selects a
+    // column, gid0 in [total_width, total_width + height) selects a row.
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0 = get_global_id(0);
+    const int gidH = gid0 - total_width;
+    const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border: replicate the first valid pixel of row gidH
+        DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+        }
+        // Handle right border: replicate the last valid pixel of row gidH
+        DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+        }
+    }
+    else
+    {
+        // Get value for corners: columns left of / right of the valid region
+        // replicate the nearest corner pixel of the valid region.
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (width - 1))
+        {
+            val_idx = gidW < 0 ? 0 : width - 1;
+        }
+
+        // Handle top border: replicate the topmost valid pixel of the column
+        DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+        }
+        // Handle bottom border: replicate the bottommost valid pixel of the column
+        DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+        }
+    }
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+    IMAGE_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos,
+    DATA_TYPE constant_value)
+{
+    Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    // One work-item handles either one image row (left+right borders) or one
+    // image column (top+bottom borders): gid0 in [0, total_width) selects a
+    // column, gid0 in [total_width, total_width + height) selects a row.
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0 = get_global_id(0);
+    const int gidH = gid0 - total_width;
+    const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border of row gidH
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+        }
+        // Handle right border of row gidH
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+        }
+    }
+    else
+    {
+        // Handle top border of column gidW
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+        }
+        // Handle bottom border of column gidW
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+        }
+    }
+}
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
new file mode 100644
index 0000000000..618937f36d
--- /dev/null
+++ b/src/core/CL/cl_kernels/gaussian_pyramid.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian1x5_sub_x(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values for the convolution (20 bytes needed)
+    uchar16 temp0 = vload16(0, src.ptr);
+    uchar4 temp1 = vload4(0, src.ptr + 16);
+
+    // Convert to USHORT8. Each vector gathers every second source pixel
+    // (sub-sampling by 2 in X) shifted by -2, -1, 0, +1, +2 relative to the
+    // output pixel's centre in the source image.
+    ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
+    ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
+    ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
+    ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
+    ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
+
+    // Compute convolution along the X direction with the binomial kernel
+    // [1 4 6 4 1]; the result stays un-normalised (U16) for the Y pass.
+    ushort8 pixels = l2_data + r2_data;
+    pixels += l1_data * (ushort8)4;
+    pixels += m_data * (ushort8)6;
+    pixels += r1_data * (ushort8)4;
+
+    // Store result
+    vstore8(pixels, 0, (__global ushort *)dst.ptr);
+}
+
+/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian5x1_sub_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load 5 consecutive rows of the intermediate (X-pass) U16 image.
+    ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
+    ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
+    ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2));
+    ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
+    ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
+
+    // Compute convolution along the Y direction with the binomial kernel [1 4 6 4 1]
+    ushort8 pixels = u2_data + d2_data;
+    pixels += u1_data * (ushort8)4;
+    pixels += m_data * (ushort8)6;
+    pixels += d1_data * (ushort8)4;
+
+    // Scale result: divide by 256 = 16 * 16, the combined weight sum of the two
+    // separable [1 4 6 4 1] passes.
+    pixels >>= (ushort8)8;
+
+    // Store result
+    vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 0000000000..caf6e3ffd8
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,1099 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
+                                    IMAGE_DECLARATION(dst))
+{
+    const uint col = get_global_id(0);
+    const uint row = get_global_id(1);
+
+    /* Source address: Matrix B */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    /* Destination address: X and Y are swapped; each 1x4 float block occupies 16 bytes along X */
+    const uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + col * dst_stride_y + row * 4 * sizeof(float);
+
+    /* Copy one 1x4 block */
+    float4 block = vload4(0, (__global float *)src.ptr);
+    vstore4(block, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F16
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
+                                    IMAGE_DECLARATION(dst))
+{
+    const uint col = get_global_id(0);
+    const uint row = get_global_id(1);
+
+    /* Source address: Matrix B */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    /* Destination address: X and Y are swapped; each 1x8 half block occupies 16 bytes along X */
+    const uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + col * dst_stride_y + row * 8 * sizeof(half);
+
+    /* Copy one 1x8 block */
+    half8 block = vload8(0, (__global half *)src.ptr);
+    vstore8(block, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
+                                    IMAGE_DECLARATION(dst))
+{
+    const uint col = get_global_id(0);
+    const uint row = get_global_id(1);
+
+    /* Source address: Matrix B */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    /* Destination address: X and Y are swapped; each 1x16 uchar block occupies 16 bytes along X */
+    const uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + col * dst_stride_y + row * 16 * sizeof(uchar);
+
+    /* Copy one 1x16 block */
+    uchar16 block = vload16(0, (__global uchar *)src.ptr);
+    vstore16(block, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U32/S32/F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
+                                       IMAGE_DECLARATION(dst))
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load a 4x4 block of Matrix A */
+    float4 row0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
+    float4 row1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
+    float4 row2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
+    float4 row3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
+
+    /* Interleave: column i of the 4x4 block becomes the i-th group of 4 consecutive outputs */
+    __global float *out = (__global float *)dst.ptr;
+    vstore4((float4)(row0.s0, row1.s0, row2.s0, row3.s0), 0, out + 0);
+    vstore4((float4)(row0.s1, row1.s1, row2.s1, row3.s1), 0, out + 4);
+    vstore4((float4)(row0.s2, row1.s2, row2.s2, row3.s2), 0, out + 8);
+    vstore4((float4)(row0.s3, row1.s3, row2.s3, row3.s3), 0, out + 12);
+}
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U16/S16/F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U16/S16/F16
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
+                                       IMAGE_DECLARATION(dst))
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load a 4x8 block of Matrix A */
+    half8 row0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
+    half8 row1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
+    half8 row2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
+    half8 row3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
+
+    /* Interleave: each pair of block columns forms 8 consecutive outputs */
+    __global half *out = (__global half *)dst.ptr;
+    vstore8((half8)((half4)(row0.s0, row1.s0, row2.s0, row3.s0), (half4)(row0.s1, row1.s1, row2.s1, row3.s1)), 0, out + 0);
+    vstore8((half8)((half4)(row0.s2, row1.s2, row2.s2, row3.s2), (half4)(row0.s3, row1.s3, row2.s3, row3.s3)), 0, out + 8);
+    vstore8((half8)((half4)(row0.s4, row1.s4, row2.s4, row3.s4), (half4)(row0.s5, row1.s5, row2.s5, row3.s5)), 0, out + 16);
+    vstore8((half8)((half4)(row0.s6, row1.s6, row2.s6, row3.s6), (half4)(row0.s7, row1.s7, row2.s7, row3.s7)), 0, out + 24);
+}
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U8/S8
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
+                                      IMAGE_DECLARATION(dst))
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load a 4x16 block of Matrix A (one 16-byte row per vload) */
+    uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
+    uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
+    uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
+    uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
+
+    /* Interleave column-by-column: block column i becomes the i-th group of 4 consecutive output bytes */
+    uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
+                             (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
+    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
+
+    val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
+                     (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
+    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
+
+    val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
+                     (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
+    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
+
+    val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
+                     (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
+    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+__kernel void gemm_accumulate_biases_f32(
+    IMAGE_DECLARATION(accum),
+    VECTOR_DECLARATION(biases))
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    /* Add the bias values to the current accumulator row and write the sum back in place */
+    float4 acc  = vload4(0, (__global float *)accum.ptr);
+    float4 bias = vload4(0, (__global float *)biases.ptr);
+    vstore4(bias + acc, 0, (__global float *)accum.ptr);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+__kernel void gemm_accumulate_biases_f16(
+    IMAGE_DECLARATION(accum),
+    VECTOR_DECLARATION(biases))
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    /* Add the bias values to the current accumulator row and write the sum back in place */
+    half8 acc  = vload8(0, (__global half *)accum.ptr);
+    half8 bias = vload8(0, (__global half *)biases.ptr);
+    vstore8(bias + acc, 0, (__global half *)accum.ptr);
+}
+
+#if defined(WIDTH_MATRIX_B)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16_u8 before running the matrix multiplication
+ *
+ * @attention The width of matrix B needs to be passed at compile time using -DWIDTH_MATRIX_B
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported formats: U8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported formats: U8
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported formats: U8
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  a_offset                           Offset to be added to each element of the matrix A
+ * @param[in]  b_offset                           Offset to be added to each element of the matrix B.
+ * @param[in]  c_offset                           Offset to be added to each element of the matrix C.
+ * @param[in]  c_mult_int                         Multiplied with each element of the matrix C.
+ * @param[in]  shift                              Number of bits to shift right the result.
+ */
+__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
+                         IMAGE_DECLARATION(src1),
+                         IMAGE_DECLARATION(dst),
+                         int a_offset,
+                         int b_offset,
+                         int c_offset,
+                         int c_mult_int,
+                         int shift)
+{
+    /* src_addr.s0 = address of matrix A */
+    /* src_addr.s1 = address of matrix B */
+
+    /* Compute address for matrix A and B */
+    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+                                                                        (src1_stride_y));
+
+    /* Add offset_first_element_in_bytes */
+    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    /* Compute end row address for matrix B */
+    int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+    /* Reset accumulators (integer accumulators: do not initialize with float literals) */
+    int16 c00 = (int16)0;
+    int16 c10 = (int16)0;
+    int16 c20 = (int16)0;
+    int16 c30 = (int16)0;
+
+    /* Main loop: consume two interleaved 4x1 blocks of A (8 bytes) and two transposed 1x16 blocks of B (32 bytes) per iteration */
+    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        int8  a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+        c00 += (int16)a0.s0 * b0;
+        c10 += (int16)a0.s1 * b0;
+        c20 += (int16)a0.s2 * b0;
+        c30 += (int16)a0.s3 * b0;
+
+        int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+
+        c00 += (int16)a0.s4 * b1;
+        c10 += (int16)a0.s5 * b1;
+        c20 += (int16)a0.s6 * b1;
+        c30 += (int16)a0.s7 * b1;
+    }
+
+    /* Leftover loop: one interleaved block of A and one transposed block of B per iteration */
+    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        int4  a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+        c00 += (int16)a0.s0 * b0;
+        c10 += (int16)a0.s1 * b0;
+        c20 += (int16)a0.s2 * b0;
+        c30 += (int16)a0.s3 * b0;
+    }
+
+    /* Compute destination address */
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Requantize: add output offset, apply the fixed-point multiplier and shift right */
+    c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
+    c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
+    c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
+    c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
+
+    /* Store 4x16 block, saturating to U8 */
+    vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
+    vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
+    vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
+    vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
+}
+#endif /* defined(WIDTH_MATRIX_B) */
+
+#if(defined WIDTH_MATRIX_B && defined ALPHA)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
+{
+    /* src_addr.s0 = address of matrix A */
+    /* src_addr.s1 = address of matrix B */
+
+    /* Compute address for matrix A and B (in bytes): row of A selected by gid(1), row of transposed B by gid(0) */
+    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+                                                                        (src1_stride_y));
+
+    /* Add offset_first_element_in_bytes */
+    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    /* Divide by 4 in order to get the src_addr in unit of float */
+    src_addr = src_addr >> 2;
+
+    /* Compute end row address for matrix B */
+    int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+    /* Reset accumulators */
+    float4 c00 = 0.0f;
+    float4 c10 = 0.0f;
+    float4 c20 = 0.0f;
+    float4 c30 = 0.0f;
+
+    /* Main loop: consume two interleaved 4-float blocks of A and two transposed 4-float blocks of B per iteration */
+    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+        c00 += (float4)a0.s0 * b0;
+        c10 += (float4)a0.s1 * b0;
+        c20 += (float4)a0.s2 * b0;
+        c30 += (float4)a0.s3 * b0;
+
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
+        b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+
+        c00 += (float4)a0.s0 * b0;
+        c10 += (float4)a0.s1 * b0;
+        c20 += (float4)a0.s2 * b0;
+        c30 += (float4)a0.s3 * b0;
+    }
+
+    /* Leftover loop: one block of A and B per iteration */
+    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+        c00 += (float4)a0.s0 * b0;
+        c10 += (float4)a0.s1 * b0;
+        c20 += (float4)a0.s2 * b0;
+        c30 += (float4)a0.s3 * b0;
+    }
+
+    /* Compute destination address */
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Multiply by the weight of matrix product */
+    c00 = c00 * (float4)ALPHA;
+    c10 = c10 * (float4)ALPHA;
+    c20 = c20 * (float4)ALPHA;
+    c30 = c30 * (float4)ALPHA;
+
+    /* Store 4x4 block */
+    vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
+    vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
+    vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
+    vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
+}
+
+/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
+{
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+
+    // Compute end row address for matrix B
+    __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+
+    // Reset accumulators (kept scalar: Bifrost has a scalar arithmetic pipeline)
+    float c00 = 0.0f;
+    float c01 = 0.0f;
+    float c02 = 0.0f;
+    float c03 = 0.0f;
+    float c10 = 0.0f;
+    float c11 = 0.0f;
+    float c12 = 0.0f;
+    float c13 = 0.0f;
+    float c20 = 0.0f;
+    float c21 = 0.0f;
+    float c22 = 0.0f;
+    float c23 = 0.0f;
+    float c30 = 0.0f;
+    float c31 = 0.0f;
+    float c32 = 0.0f;
+    float c33 = 0.0f;
+
+    // Main loop: unrolled x4, consumes four interleaved/transposed 4-float blocks of A and B per iteration
+    for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4);
+        b0 = vload4(0, src_addr_b + 4);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 8);
+        b0 = vload4(0, src_addr_b + 8);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 12);
+        b0 = vload4(0, src_addr_b + 12);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+    }
+
+    // Leftover loop: one 4-float block of A and B per iteration
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Multiply by the weight of matrix product
+    c00 = c00 * ALPHA;
+    c01 = c01 * ALPHA;
+    c02 = c02 * ALPHA;
+    c03 = c03 * ALPHA;
+    c10 = c10 * ALPHA;
+    c11 = c11 * ALPHA;
+    c12 = c12 * ALPHA;
+    c13 = c13 * ALPHA;
+    c20 = c20 * ALPHA;
+    c21 = c21 * ALPHA;
+    c22 = c22 * ALPHA;
+    c23 = c23 * ALPHA;
+    c30 = c30 * ALPHA;
+    c31 = c31 * ALPHA;
+    c32 = c32 * ALPHA;
+    c33 = c33 * ALPHA;
+
+    // Store 4x4 block. No barrier is needed: work-items never exchange data in this kernel,
+    // so a workgroup-wide global memory fence before the stores would only add sync overhead.
+    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
+    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
+    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
+    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+}
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
+ *
+ * Each work-item produces a 4x8 block of the destination matrix.
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B: one interleaved row of A per global y, one transposed row of B per global x */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Divide by 2 in order to get the src_addr in unit of half (sizeof(half) == 2) */
+ src_addr = src_addr >> 1;
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators: each cN0 holds one 8-wide row of the 4x8 output block */
+ half8 c00 = 0.0f;
+ half8 c10 = 0.0f;
+ half8 c20 = 0.0f;
+ half8 c30 = 0.0f;
+
+ /* Main loop, unrolled by 2: each iteration consumes 8 halves of A (interleaved) and 16 halves of B (transposed) */
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
+ b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ /* Left-over loop: handles the remaining columns when WIDTH_MATRIX_B is not a multiple of 8 in this addressing scheme */
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * (half8)ALPHA;
+ c10 = c10 * (half8)ALPHA;
+ c20 = c20 * (half8)ALPHA;
+ c30 = c30 * (half8)ALPHA;
+
+ /* Store 4x8 block */
+ vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
+ vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
+ vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
+ vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
+}
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+ *
+ * Each work-item computes 4 consecutive elements of the output row.
+ *
+ * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* First of the 4 output columns computed by this work-item */
+ int idx = get_global_id(0) * 4;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ /* Main loop, unrolled by 2: each iteration consumes 2 elements of A and 2 rows of B */
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ /* Left-over loop when WIDTH_VECTOR_A is odd */
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (float4)ALPHA;
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+
+/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+ *
+ * Each work-item computes 8 consecutive elements of the output row.
+ *
+ * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* First of the 8 output columns computed by this work-item */
+ int idx = get_global_id(0) * 8;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(half);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
+
+ half8 acc = 0.0f;
+
+ /* Main loop, unrolled by 4: each iteration consumes 4 elements of A and 4 rows of B */
+ for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
+ {
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+ half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
+ half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+
+ acc += b0 * (half8)a0.s0;
+ acc += b1 * (half8)a0.s1;
+ acc += b2 * (half8)a0.s2;
+ acc += b3 * (half8)a0.s3;
+ }
+
+ /* Left-over loop when WIDTH_VECTOR_A is not a multiple of 4 */
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
+ {
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (half8)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (half8)ALPHA;
+
+ vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
+#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+
+#if(defined BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * Each work-item updates 4 consecutive elements of the destination matrix in place.
+ *
+ * @attention The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Set up accessors: src holds matrix C, dst holds alpha * (A x B) and receives the result */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Read a 4-wide block of matrix C and the matching block of the accumulator */
+ float4 c = vload4(0, (__global float *)src.ptr);
+ float4 axb = vload4(0, (__global float *)dst.ptr);
+
+ /* out = alpha * (A x B) + beta * C */
+ float4 result = axb + (float4)BETA * c;
+
+ /* Write the result back in place over the accumulator */
+ vstore4(result, 0, (__global float *)dst.ptr);
+}
+
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * Each work-item updates 8 consecutive elements of the destination matrix in place.
+ *
+ * @attention The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Set up accessors: src holds matrix C, dst holds alpha * (A x B) and receives the result */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Read an 8-wide block of matrix C and the matching block of the accumulator */
+ half8 c = vload8(0, (__global half *)src.ptr);
+ half8 axb = vload8(0, (__global half *)dst.ptr);
+
+ /* out = alpha * (A x B) + beta * C */
+ half8 result = axb + (half8)BETA * c;
+
+ /* Write the result back in place over the accumulator */
+ vstore8(result, 0, (__global half *)dst.ptr);
+}
+#endif /* (defined BETA) */
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
+ *
+ * Each work-item computes 4 consecutive output elements; each global y index uses its own row of A and its own Z-plane of B (locally connected weights).
+ *
+ * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @attention The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+ TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ /* Compute the address for the vector A (row idy) and matrix B (Z-plane idy) */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ /* Main loop, unrolled by 2: each iteration consumes 2 elements of A and 2 rows of B */
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ /* Left-over loop when WIDTH_VECTOR_A is odd */
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address. Note: no ALPHA scaling is applied in this kernel */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
new file mode 100644
index 0000000000..5320a064ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/harris_corners.cl
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Function running harris score on 3x3 block size
+ *
+ * Each work-item computes 4 adjacent Harris scores (one vstore4 per work-item),
+ * reading an 8-wide strip of gradients per row so the four 3-wide windows overlap.
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply harris score
+ */
+__kernel void harris_score_3x3(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy sums over the 3x3 window, one lane per output pixel */
+ float4 gx2 = (float4)0.0f;
+ float4 gy2 = (float4)0.0f;
+ float4 gxgy = (float4)0.0f;
+
+ /* Row0 (one row above the anchor): l/m/r are the left/middle/right columns of each 3-wide window */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
+
+ float4 l_gx = convert_float4(temp_gx.s0123);
+ float4 m_gx = convert_float4(temp_gx.s1234);
+ float4 r_gx = convert_float4(temp_gx.s2345);
+
+ float4 l_gy = convert_float4(temp_gy.s0123);
+ float4 m_gy = convert_float4(temp_gy.s1234);
+ float4 r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row1 (the anchor row) */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row2 (one row below the anchor) */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Compute trace and determinant of the structure tensor */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score: Mc = det - k * trace^2, then normalize */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ /* Zero out scores at or below the strength threshold */
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating harris score 1x5.
+ *
+ * Accumulates Gx^2, Gy^2 and Gx*Gy over one 1x5 row of the window for 4 adjacent
+ * output pixels at once (the 5-wide windows are the overlapping slices s0123..s4567).
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ *
+ * @return float16 packed as (gx2[0..3], gy2[0..3], gxgy[0..3], zero padding)
+ */
+inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row: load an 8-wide strip starting 2 pixels left of the anchor */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
+
+ float4 gx = convert_float4(temp_gx.s0123);
+ float4 gy = convert_float4(temp_gy.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s1234);
+ gy = convert_float4(temp_gy.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s2345);
+ gy = convert_float4(temp_gy.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s3456);
+ gy = convert_float4(temp_gy.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s4567);
+ gy = convert_float4(temp_gy.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ /* Pack the three partial sums into one float16 (last 4 lanes unused) */
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running harris score on 5x5 block size
+ *
+ * Each work-item computes 4 adjacent Harris scores by summing 1x5 row
+ * contributions over the 5 rows of the window (see @ref harris_score_1x5).
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply harris score
+ */
+__kernel void harris_score_5x5(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy packed as three float4 lanes inside one float16 */
+ float16 res = (float16)0.0f;
+
+ /* Accumulate the 5 rows of the window (rows -2..+2 relative to the anchor) */
+ for(int i = -2; i < 3; i++)
+ {
+ res += harris_score_1x5(&src_gx, &src_gy, i);
+ }
+
+ /* Unpack the partial sums (layout defined by harris_score_1x5) */
+ float4 gx2 = res.s0123;
+ float4 gy2 = res.s4567;
+ float4 gxgy = res.s89AB;
+
+ /* Compute trace and determinant of the structure tensor */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score: Mc = det - k * trace^2, then normalize */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ /* Zero out scores at or below the strength threshold */
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating harris score 1x7.
+ *
+ * Accumulates Gx^2, Gy^2 and Gx*Gy over one 1x7 row of the window for 4 adjacent
+ * output pixels at once. The 4 outputs need 10 consecutive gradient values, so an
+ * 8-wide load is supplemented with an extra 2-wide load on the right.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ *
+ * @return float16 packed as (gx2[0..3], gy2[0..3], gxgy[0..3], zero padding)
+ */
+inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row: 8-wide strip starting 3 pixels left of the anchor, plus 2 extra pixels on the right */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
+
+ float4 gx = convert_float4(temp_gx0.s0123);
+ float4 gy = convert_float4(temp_gy0.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s1234);
+ gy = convert_float4(temp_gy0.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s2345);
+ gy = convert_float4(temp_gy0.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s3456);
+ gy = convert_float4(temp_gy0.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s4567);
+ gy = convert_float4(temp_gy0.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ /* Windows that straddle the boundary between the 8-wide and 2-wide loads */
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ /* Pack the three partial sums into one float16 (last 4 lanes unused) */
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running harris score on 7x7 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
+ * @param[in]  src_gx_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gx_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
+ * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
+ * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
+ * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
+ * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
+ * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
+ */
+__kernel void harris_score_7x7(
+    IMAGE_DECLARATION(src_gx),
+    IMAGE_DECLARATION(src_gy),
+    IMAGE_DECLARATION(vc),
+    float sensitivity,
+    float strength_thresh,
+    float pow4_normalization_factor)
+{
+    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
+
+    /* Gx^2, Gy^2 and Gx*Gy accumulated over the seven rows of the window */
+    float16 res = (float16)0.0f;
+
+    /* Accumulate each row's contribution */
+    for(int i = -3; i < 4; i++)
+    {
+        res += harris_score_1x7(&src_gx, &src_gy, i);
+    }
+
+    float4 gx2  = res.s0123;
+    float4 gy2  = res.s4567;
+    float4 gxgy = res.s89AB;
+
+    /* Compute trace and determinant of the structure matrix */
+    float4 trace = gx2 + gy2;
+    float4 det   = gx2 * gy2 - (gxgy * gxgy);
+
+    /* Harris response: det - k * trace^2, normalized */
+    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+    /* Zero out responses below the strength threshold */
+    mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+    vstore4(mc, 0, (__global float *)vc.ptr);
+}
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 0000000000..6db8ed567c
--- /dev/null
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+/* Required by kernels operating on the OpenCL 'half' type. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+/** Clamp x to the closed interval [min_val, max_val]. */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** Build an OpenCL vector type from a scalar type and a width, e.g. VEC_DATA_TYPE(short, 8) -> short8. */
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+/** Non-saturating conversion of x to the given type (wraps the convert_<type> builtin). */
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+/** Saturating conversion of x to the given type (wraps the convert_<type>_sat builtin). */
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+/** Saturating conversion of x to the given type with an explicit rounding mode (e.g. rte, rtz). */
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+/** Kernel parameter list describing a 1D buffer named 'name'. */
+#define VECTOR_DECLARATION(name) \
+    __global uchar *name##_ptr, \
+    uint name##_stride_x, \
+    uint name##_step_x, \
+    uint name##_offset_first_element_in_bytes
+
+/** Kernel parameter list describing a 2D image named 'name'. */
+#define IMAGE_DECLARATION(name) \
+    __global uchar *name##_ptr, \
+    uint name##_stride_x, \
+    uint name##_step_x, \
+    uint name##_stride_y, \
+    uint name##_step_y, \
+    uint name##_offset_first_element_in_bytes
+
+/** Kernel parameter list describing a 3D tensor named 'name'. */
+#define TENSOR3D_DECLARATION(name) \
+    __global uchar *name##_ptr, \
+    uint name##_stride_x, \
+    uint name##_step_x, \
+    uint name##_stride_y, \
+    uint name##_step_y, \
+    uint name##_stride_z, \
+    uint name##_step_z, \
+    uint name##_offset_first_element_in_bytes
+
+/** Build a Vector from the parameters declared with VECTOR_DECLARATION, advanced to this workitem's data. */
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+/** Same as CONVERT_TO_VECTOR_STRUCT but with a zero step (pointer not advanced per workitem). */
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+/** Build an Image from the parameters declared with IMAGE_DECLARATION, advanced to this workitem's data. */
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+/** Same as CONVERT_TO_IMAGE_STRUCT but with zero steps (pointer not advanced per workitem). */
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+/** Build a Tensor3D from the parameters declared with TENSOR3D_DECLARATION, advanced to this workitem's data. */
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+/** Same as CONVERT_TO_TENSOR3D_STRUCT but with zero steps (pointer not advanced per workitem). */
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+    int offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
+    int stride_x; /**< Stride of the vector in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+    int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int stride_x; /**< Stride of the image in X dimension (in bytes) */
+    int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+    int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
+    int stride_x; /**< Stride of the tensor in X dimension (in bytes) */
+    int stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
+    int stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Wrap vector information into a Vector structure and advance the pointer to this workitem's first element.
+ *
+ * @param[in] ptr                           Pointer to the start of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ *
+ * @return A Vector object
+ */
+Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vec;
+    vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vec.stride_x                      = stride_x;
+    /* Jump to this workitem's first element in one step */
+    vec.ptr = ptr + offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vec;
+}
+
+/** Wrap image information into an Image structure and advance the pointer to this workitem's first element.
+ *
+ * @param[in] ptr                           Pointer to the start of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ *
+ * @return An Image object
+ */
+Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    /* Jump to this workitem's first element in one step */
+    img.ptr = ptr + offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into a Tensor3D structure and advance the pointer to this workitem's first element.
+ *
+ * @param[in] ptr                           Pointer to the start of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    /* Jump to this workitem's first element in one step */
+    tensor.ptr = ptr + offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+/** Compute the address of the element at relative position x within a Vector.
+ *
+ * @param[in] vec Pointer to the vector descriptor
+ * @param[in] x   Relative X position
+ */
+__global inline const uchar *vector_offset(const Vector *vec, int x)
+{
+    const int byte_displacement = x * vec->stride_x;
+    return vec->ptr + byte_displacement;
+}
+
+/** Compute the address of the pixel at relative position (x, y) within an Image.
+ *
+ * @param[in] img Pointer to the image descriptor
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+__global inline uchar *offset(const Image *img, int x, int y)
+{
+    const int byte_displacement = x * img->stride_x + y * img->stride_y;
+    return img->ptr + byte_displacement;
+}
+
+/** Compute the address of the element at relative position (x, y, z) within a Tensor3D.
+ *
+ * @param[in] tensor Pointer to the tensor descriptor
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    const int byte_displacement = x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+    return tensor->ptr + byte_displacement;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
new file mode 100644
index 0000000000..a652b28e6a
--- /dev/null
+++ b/src/core/CL/cl_kernels/histogram.cl
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Atomically increment the 16 histogram bins addressed by the components of win_pos.
+ *
+ * histogram must point to a uint bin array (__local or __global); win_pos must be
+ * a 16-component integer vector of bin indices, each within the array's bounds.
+ */
+#define VATOMIC_INC16(histogram, win_pos) \
+    { \
+        atomic_inc(histogram + win_pos.s0); \
+        atomic_inc(histogram + win_pos.s1); \
+        atomic_inc(histogram + win_pos.s2); \
+        atomic_inc(histogram + win_pos.s3); \
+        atomic_inc(histogram + win_pos.s4); \
+        atomic_inc(histogram + win_pos.s5); \
+        atomic_inc(histogram + win_pos.s6); \
+        atomic_inc(histogram + win_pos.s7); \
+        atomic_inc(histogram + win_pos.s8); \
+        atomic_inc(histogram + win_pos.s9); \
+        atomic_inc(histogram + win_pos.sa); \
+        atomic_inc(histogram + win_pos.sb); \
+        atomic_inc(histogram + win_pos.sc); \
+        atomic_inc(histogram + win_pos.sd); \
+        atomic_inc(histogram + win_pos.se); \
+        atomic_inc(histogram + win_pos.sf); \
+    }
+
+/** Calculate the histogram of an 8 bit grayscale image.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length of num_bins
+ *
+ * @note Pixels outside [offset, offrange) are mapped to bin index num_bins, which acts
+ * as a discard slot one past the last real bin, so histogram_local must provide
+ * num_bins + 1 entries.
+ *
+ * @param[in]     input_ptr                            Pointer to the first source image. Supported data types: U8
+ * @param[in]     input_stride_x                       Stride of the first source image in X dimension (in bytes)
+ * @param[in]     input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     input_stride_y                       Stride of the first source image in Y dimension (in bytes)
+ * @param[in]     input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in,out] histogram_local                      The local buffer to hold histogram result in per workgroup. Supported data types: U32
+ * @param[out]    histogram                            The output buffer to hold histogram final result. Supported data types: U32
+ * @param[in]     num_bins                             The number of bins
+ * @param[in]     offset                               The start of values to use (inclusive)
+ * @param[in]     range                                The range of a bin
+ * @param[in]     offrange                             The maximum value (exclusive)
+ */
+__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
+                                __local uint *histogram_local,
+                                __global uint *restrict histogram,
+                                uint num_bins,
+                                uint offset,
+                                uint range,
+                                uint offrange)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+    uint local_id_x = get_local_id(0);
+
+    uint local_x_size = get_local_size(0);
+
+    /* Cooperatively zero the local histogram, including the discard slot at index num_bins.
+     * NOTE(review): in the num_bins > local_x_size path the discard slot (index num_bins)
+     * is not zeroed; it is also never flushed to the global histogram in that path, so the
+     * final result is unaffected — confirm against the host-side local buffer size. */
+    if(num_bins > local_x_size)
+    {
+        for(int i = local_id_x; i < num_bins; i += local_x_size)
+        {
+            histogram_local[i] = 0;
+        }
+    }
+    else
+    {
+        if(local_id_x <= num_bins)
+        {
+            histogram_local[local_id_x] = 0;
+        }
+    }
+
+    /* Each work-item reads 16 consecutive pixels */
+    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+    /* Map each pixel to its bin; out-of-range pixels go to the discard slot num_bins */
+    uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    VATOMIC_INC16(histogram_local, win_pos);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Merge the local histogram into the global one.
+     * NOTE(review): the <= comparison below also flushes the discard slot into
+     * histogram[num_bins]; confirm the global buffer is allocated with num_bins + 1 entries. */
+    if(num_bins > local_x_size)
+    {
+        for(int i = local_id_x; i < num_bins; i += local_x_size)
+        {
+            atomic_add(histogram + i, histogram_local[i]);
+        }
+    }
+    else
+    {
+        if(local_id_x <= num_bins)
+        {
+            atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
+        }
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image's border.
+ *
+ * Each thread will process one pixel using global atomic.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length of num_bins
+ *
+ * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[out] histogram                            The output buffer to hold histogram final result. Supported data types: U32
+ * @param[in]  num_bins                             The number of bins
+ * @param[in]  offset                               The start of values to use (inclusive)
+ * @param[in]  range                                The range of a bin
+ * @param[in]  offrange                             The maximum value (exclusive)
+ */
+__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
+                                 __global uint *restrict histogram,
+                                 uint num_bins,
+                                 uint offset,
+                                 uint range,
+                                 uint offrange)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+    uint val = (uint)(*input_buffer.ptr);
+
+    /* win_pos is only meaningful when val >= offset; the guard below also rejects val >= offrange */
+    uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
+
+    /* Only count pixels inside [offset, offrange) */
+    if(val >= offset && (val < offrange))
+    {
+        atomic_inc(histogram + win_pos);
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image with a fixed bin size of 256 and window size of 1.
+ *
+ * Each work-item processes 16 pixels, voting into a per-workgroup local histogram
+ * with local atomics; once the whole workgroup is done, the local histogram is
+ * merged into the global one with global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements
+ *
+ * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  histogram_local                      The local buffer to hold histogram result in per workgroup. Supported data types: U32
+ * @param[out] histogram                            The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
+                                      __local uint *histogram_local,
+                                      __global uint *restrict histogram)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+    const uint lid   = get_local_id(0);
+    const uint lsize = get_local_size(0);
+
+    /* Cooperatively clear the 256 local bins */
+    for(uint bin = lid; bin < 256; bin += lsize)
+    {
+        histogram_local[bin] = 0;
+    }
+
+    /* Each work-item reads 16 consecutive pixels; each pixel value is its bin index */
+    uint16 pixels = convert_uint16(vload16(0, input_buffer.ptr));
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    atomic_inc(histogram_local + pixels.s0);
+    atomic_inc(histogram_local + pixels.s1);
+    atomic_inc(histogram_local + pixels.s2);
+    atomic_inc(histogram_local + pixels.s3);
+    atomic_inc(histogram_local + pixels.s4);
+    atomic_inc(histogram_local + pixels.s5);
+    atomic_inc(histogram_local + pixels.s6);
+    atomic_inc(histogram_local + pixels.s7);
+    atomic_inc(histogram_local + pixels.s8);
+    atomic_inc(histogram_local + pixels.s9);
+    atomic_inc(histogram_local + pixels.sa);
+    atomic_inc(histogram_local + pixels.sb);
+    atomic_inc(histogram_local + pixels.sc);
+    atomic_inc(histogram_local + pixels.sd);
+    atomic_inc(histogram_local + pixels.se);
+    atomic_inc(histogram_local + pixels.sf);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Merge this workgroup's histogram into the global result */
+    for(uint bin = lid; bin < 256; bin += lsize)
+    {
+        atomic_add(histogram + bin, histogram_local[bin]);
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image's border with a fixed bin size of 256 and window size of 1.
+ *
+ * Each work-item votes a single pixel directly into the global histogram with a
+ * global atomic; the pixel value itself is the bin index.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256
+ *
+ * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[out] histogram                            The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
+                                       __global uint *restrict histogram)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+    const uint pixel   = (uint)(*input_buffer.ptr);
+    atomic_inc(histogram + pixel);
+}
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
new file mode 100644
index 0000000000..31dd57b767
--- /dev/null
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes)
+ * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image
+ * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  phase_stride_y                       Stride of the phase image in Y dimension (in bytes)
+ * @param[in]  phase_step_y                         phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  phase_offset_first_element_in_bytes  The offset of the first element in the phase image
+ * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+                                      IMAGE_DECLARATION(phase),
+                                      IMAGE_DECLARATION(dst))
+{
+    /* Per-cell histogram accumulated in private memory */
+    float bins[NUM_BINS] = { 0 };
+
+    // Compute address for the magnitude and phase images
+    Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
+    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+    __global uchar *mag_row_ptr   = mag.ptr;
+    __global uchar *phase_row_ptr = phase.ptr;
+
+    for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+    {
+        int xc = 0;
+        for(; xc <= (CELL_WIDTH - 4); xc += 4)
+        {
+            // Load magnitude and phase values
+            const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+            float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+            // Scale phase: phase * scale + 0.5f
+            phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+            // Compute histogram index.
+            int4 hidx_s32 = convert_int4(phase_f32);
+
+            // Compute magnitude weights (w0 and w1)
+            const float4 hidx_f32 = convert_float4(hidx_s32);
+
+            // w1 = phase_f32 - hidx_s32
+            const float4 w1_f32 = phase_f32 - hidx_f32;
+
+            // w0 = 1.0 - w1
+            const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+            // Calculate the weights for splitting vote
+            const float4 mag_w0_f32 = mag_f32 * w0_f32;
+            const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+            // Weighted vote between 2 bins
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin 0
+            bins[hidx_s32.s0] += mag_w0_f32.s0;
+            bins[hidx_s32.s1] += mag_w0_f32.s1;
+            bins[hidx_s32.s2] += mag_w0_f32.s2;
+            bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+            hidx_s32 += (int4)1;
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin1
+            bins[hidx_s32.s0] += mag_w1_f32.s0;
+            bins[hidx_s32.s1] += mag_w1_f32.s1;
+            bins[hidx_s32.s2] += mag_w1_f32.s2;
+            bins[hidx_s32.s3] += mag_w1_f32.s3;
+        }
+
+        // Left over computation
+        for(; xc < CELL_WIDTH; xc++)
+        {
+            const float mag_value = *((__global short *)mag_row_ptr + xc);
+            // BUGFIX: the phase must be read from the phase image, not the magnitude
+            // image (the vectorized loop above loads it from phase_row_ptr).
+            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+            const float w1          = phase_value - floor(phase_value);
+
+            // The quantised phase is the histogram index [0, NUM_BINS - 1]
+            // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+            const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+            // Weighted vote between 2 bins
+            bins[hidx] += mag_value * (1.0f - w1);
+            bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+        }
+
+        // Point to the next row of magnitude and phase images
+        mag_row_ptr += mag_stride_y;
+        phase_row_ptr += phase_stride_y;
+    }
+
+    // Compute address for the destination image
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store the local HOG in the global memory
+    int xc = 0;
+    for(; xc <= (NUM_BINS - 4); xc += 4)
+    {
+        float4 values = vload4(0, bins + xc);
+
+        vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+    }
+
+    // Left over stores
+    for(; xc < NUM_BINS; ++xc)
+    {
+        ((__global float *)dst.ptr)[xc] = bins[xc];
+    }
+}
+#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
+#endif
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image which stores the normlized HOG Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ float sum = 0.0f;
+ float4 sum_f32 = (float4)(0.0f);
+
+ // Compute address for the source and destination tensor
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // First pass: accumulate sum of squares (L2/L2HYS) or sum of absolute values (L1)
+ // over all cells of the block, while copying the cell histograms linearly into dst
+ for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+ {
+ const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+ int xc = 0;
+ for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+ {
+ const float4 val0 = vload4(0, hist_ptr + xc + 0);
+ const float4 val1 = vload4(0, hist_ptr + xc + 4);
+ const float4 val2 = vload4(0, hist_ptr + xc + 8);
+ const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ // Compute val^2 for L2_NORM or L2HYS_NORM
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+#else
+ // Compute |val| for L1_NORM
+ sum_f32 += fabs(val0);
+ sum_f32 += fabs(val1);
+ sum_f32 += fabs(val2);
+ sum_f32 += fabs(val3);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
+ // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
+ // will be accessed consecutively
+ vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+ }
+
+ // Compute left over
+ for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+ {
+ const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ sum += val * val;
+#else
+ sum += fabs(val);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+ }
+ }
+
+ // Horizontal reduction of the vector accumulator into the scalar sum
+ sum += dot(sum_f32, (float4)1.0f);
+
+ // NOTE(review): the NUM_BINS_PER_BLOCK * 0.1f term acts as an epsilon against division by
+ // zero; presumably chosen to match OpenCV's HOG implementation (cf. the 1e-3f below) - confirm
+ float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+ // Second pass (L2HYS only): scale, clip at L2_HYST_THRESHOLD, then renormalize
+ // Reset sum
+ sum_f32 = (float4)0.0f;
+ sum = 0.0f;
+
+ int k = 0;
+ for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+ // Scale val
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ // Clip val if over _threshold_l2hys
+ val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+ val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+ val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+ val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+ // Compute val^2
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+ }
+
+ // Compute left over
+ for(; k < NUM_BINS_PER_BLOCK; ++k)
+ {
+ float val = ((__global float *)dst.ptr)[k] * scale;
+
+ // Clip scaled input_value if over L2_HYST_THRESHOLD
+ val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+ sum += val * val;
+
+ ((__global float *)dst.ptr)[k] = val;
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+ // We use the same constants of OpenCV
+ scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+
+ // Final pass: apply the normalization scale factor to every bin of the block
+ int i = 0;
+ for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+ // Multiply val by the normalization scale factor
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+ }
+
+ for(; i < NUM_BINS_PER_BLOCK; ++i)
+ {
+ ((__global float *)dst.ptr)[i] *= scale;
+ }
+}
+#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectioWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
+ * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst Pointer to DetectionWindow array
+ * @param[out] num_detection_windows Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+ __global float *hog_descriptor,
+ __global DetectionWindow *dst,
+ __global uint *num_detection_windows)
+{
+ // Check if the DetectionWindow array is full
+ // Non-atomic early-out; the capacity is re-checked after atomic_inc below, so this is only an optimization
+ if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+ {
+ return;
+ }
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Row stride of the source expressed in float elements rather than bytes
+ const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+ // Init score_f32 with 0
+ float4 score_f32 = (float4)0.0f;
+
+ // Init score with 0
+ float score = 0.0f;
+
+ __global float *src_row_ptr = (__global float *)src.ptr;
+
+ // Compute Linear SVM: dot product between the window's HOG features and the SVM weights
+ for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+ {
+ int xb = 0;
+
+ const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+ for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+ {
+ // Load descriptor values
+ float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+ float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+ float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+ float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+ // Multiply accumulate
+ score_f32 += a0_f32 * b0_f32;
+ score_f32 += a1_f32 * b1_f32;
+ }
+
+ // Scalar left-over for the remaining bins of the row
+ for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+ {
+ const float a = src_row_ptr[xb];
+ const float b = hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ // Reduce the vector accumulator into the scalar score
+ score += dot(score_f32, (float4)1.0f);
+
+ // Add the bias. The bias is located at the position (descriptor_size() - 1)
+ // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+ score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+ if(score > (float)THRESHOLD)
+ {
+ // Reserve a slot atomically; discard the detection if the array is already full
+ int id = atomic_inc(num_detection_windows);
+ if(id < MAX_NUM_DETECTION_WINDOWS)
+ {
+ dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
+ dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+ dst[id].width = DETECTION_WINDOW_WIDTH;
+ dst[id].height = DETECTION_WINDOW_HEIGHT;
+ dst[id].idx_class = IDX_CLASS;
+ dst[id].score = score;
+ }
+ }
+}
+#endif // (defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
new file mode 100644
index 0000000000..970e04e150
--- /dev/null
+++ b/src/core/CL/cl_kernels/integral_image.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the horizontal integral of the image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void integral_horizontal(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Running sum carried across 16-pixel chunks of the row
+ uint prev = 0;
+
+ // Serial prefix sum over the row, 16 pixels at a time; the trip count is uniform
+ // across the work-group so every work-item reaches the barrier the same number of times
+ for(uint j = 0; j < src_step_x; j += 16)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
+ // In-register inclusive scan of the 16 loaded pixels
+ res.s0 += prev;
+ res.s1 += res.s0;
+ res.s2 += res.s1;
+ res.s3 += res.s2;
+ res.s4 += res.s3;
+ res.s5 += res.s4;
+ res.s6 += res.s5;
+ res.s7 += res.s6;
+ res.s8 += res.s7;
+ res.s9 += res.s8;
+ res.sA += res.s9;
+ res.sB += res.sA;
+ res.sC += res.sB;
+ res.sD += res.sC;
+ res.sE += res.sD;
+ res.sF += res.sE;
+ // Carry the last partial sum into the next chunk
+ prev = res.sF;
+ vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
+ }
+}
+
+/** This function computes the vertical integral of the image.
+ *
+ * @param[in,out] src_ptr Pointer to the source image. Supported data types: U32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Image height.
+ */
+__kernel void integral_vertical(
+ IMAGE_DECLARATION(src),
+ uint height)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // In-place column-wise prefix sum: each iteration adds the previous row's
+ // running totals (8 columns at a time) to the current row
+ uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
+ for(uint j = 1; j < height; ++j)
+ {
+ // Uniform trip count (height) so all work-items hit the barrier together
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
+ res += prev;
+ vstore8(res, 0, (__global uint *)offset(&src, 0, j));
+ prev = res;
+ }
+}
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
new file mode 100644
index 0000000000..c4b0df8de9
--- /dev/null
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculates L1 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L1 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ // |a| + |b| with saturating add; CONVERT_SAT narrows abs()'s unsigned result back to DATA_TYPE
+ return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates L2 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L2 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
+{
+ // sqrt(a^2 + b^2), rounded to nearest by adding 0.5f before the saturating conversion;
+ // the squares are accumulated in uint to avoid signed overflow for large gradients
+ return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates unsigned phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
+ */
+inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ // atan2pi returns the angle in units of pi, i.e. [-1, 1]; scale to degrees [-180, 180]
+ float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+ // Fold negative angles into [0, 180] (unsigned phase)
+ angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+ return convert_uchar16(angle_deg_f32);
+}
+
+/** Calculates signed phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
+ */
+inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ // atan2pi returns the angle in units of pi, i.e. [-1, 1)
+ float16 arct = atan2pi(convert_float16(b), convert_float16(a));
+ // Fold negative angles so the range becomes [0, 2)
+ arct = select(arct, arct + 2, arct < 0.0f);
+
+ // Scale [0, 2) to [0, 256) with rounding; the & 0xFF wraps 256 back to 0
+ return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & 0xFFu);
+}
+
+#if(1 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
+#elif(2 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
+#else
+#define MAGNITUDE_OP(x, y)
+#endif
+
+#if(1 == PHASE)
+#define PHASE_OP(x, y) phase_unsigned((x), (y))
+#elif(2 == PHASE)
+#define PHASE_OP(x, y) phase_signed((x), (y))
+#else
+#define PHASE_OP(x, y)
+#endif
+
+/** Calculate the magnitude and phase of given the gradients of an image.
+ *
+ * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
+ * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256).
+ *
+ * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1
+ * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1
+ * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int
+ *
+ * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32
+ * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] gy_ptr Pointer to the second source image (gradient Y) . Supported data types: S16, S32
+ * @param[in] gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32
+ * @param[in] magnitude_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] magnitude_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image
+ * */
+__kernel void magnitude_phase(
+ IMAGE_DECLARATION(gx),
+ IMAGE_DECLARATION(gy)
+#ifdef MAGNITUDE
+ ,
+ IMAGE_DECLARATION(magnitude)
+#endif
+#ifdef PHASE
+ ,
+ IMAGE_DECLARATION(phase)
+#endif
+)
+{
+ // Get pixels pointer
+ Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
+ Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
+
+ // Load 16 gradient values per work-item from each input
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
+
+ // Calculate and store the results; each output is compiled in only when requested
+#ifdef MAGNITUDE
+ Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
+ vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
+#endif
+#ifdef PHASE
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+ // PHASE_OP yields uchar16, matching the U8 phase image
+ vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
+#endif
+}
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
new file mode 100644
index 0000000000..50b8312548
--- /dev/null
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+/** This function calculates the sum and sum of squares of a given input image.
+ *
+ * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Height of the input image
+ * @param[out] global_sum Global sum of all elements
+ * @param[out] global_sum_sq Global sum of squares of all elements
+ */
+__kernel void mean_stddev_accumulate(
+ IMAGE_DECLARATION(src),
+ uint height,
+ __global ulong *global_sum
+#if defined STDDEV
+ ,
+ __global ulong *global_sum_sq
+#endif
+)
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Per-work-item partial sums over a column strip of 8 pixels
+ uint8 tmp_sum = 0;
+#if defined STDDEV
+ // NOTE(review): 32-bit accumulator for squared U8 values (max 65025 per pixel) -
+ // could overflow for very tall images (height > ~66000); confirm expected image sizes
+ uint8 tmp_sum_sq = 0;
+#endif
+ // Calculate partial sum
+ for(int i = 0; i < height; i++)
+ {
+ // Load data
+ uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
+
+ tmp_sum += data;
+#if defined STDDEV
+ tmp_sum_sq += data * data;
+#endif
+ }
+ // Perform reduction of the 8 lanes, then merge into the 64-bit global total atomically
+ tmp_sum.s0123 += tmp_sum.s4567;
+ tmp_sum.s01 += tmp_sum.s23;
+ atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
+
+#if defined STDDEV
+ tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
+ tmp_sum_sq.s01 += tmp_sum_sq.s23;
+ atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
+#endif
+}
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
new file mode 100644
index 0000000000..799b1e8c3b
--- /dev/null
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#ifndef DATA_TYPE_MIN
+#define DATA_TYPE_MIN 0x0
+#endif
+
+#ifndef DATA_TYPE_MAX
+#define DATA_TYPE_MAX 0xFF
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** This function identifies the min and maximum value of an input image.
+ *
+ * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
+ * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
+ * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[in] width Input image width
+ */
+__kernel void minmax(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Initialize local minimum and local maximum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_min = type_max;
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_max = type_min;
+
+ // Calculate min/max of row, 16 pixels at a time
+ uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ local_min = min(data, local_min);
+ local_max = max(data, local_max);
+ }
+
+#ifdef NON_MULTIPLE_OF_16
+ // Handle non multiple of 16: mask lanes past the image width with the identity
+ // element (type_min for max, type_max for min) so they cannot win the reduction
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+ local_max = max(local_max, select(type_min, data, widx));
+ local_min = min(local_min, select(type_max, data, widx));
+#endif
+
+ // Perform min/max reduction: 16 -> 8 -> 4 -> 2 -> 1 lanes
+ local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
+ local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
+
+ local_min.s0123 = min(local_min.s0123, local_min.s4567);
+ local_max.s0123 = max(local_max.s0123, local_max.s4567);
+
+ local_min.s01 = min(local_min.s01, local_min.s23);
+ local_max.s01 = max(local_max.s01, local_max.s23);
+
+ local_min.s0 = min(local_min.s0, local_min.s1);
+ local_max.s0 = max(local_max.s0, local_max.s1);
+
+ // Update global min/max atomically; min_max[0] holds the minimum, min_max[1] the maximum
+ atomic_min(&min_max[0], local_min.s0);
+ atomic_max(&min_max[1], local_max.s0);
+}
+
+/** This function counts the min and max occurrences in an image and tags their position.
+ *
+ * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
+ * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
+ * @param[out] min_loc Array that holds the location of the minimum value occurrences
+ * @param[in] max_min_loc_count The maximum number of min value occurrences coordinates the array can hold
+ * @param[out] max_loc Array that holds the location of the maximum value occurrences
+ * @param[in] max_max_loc_count The maximum number of max value occurrences coordinates the array can hold
+ */
+__kernel void minmaxloc(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ __global uint *min_max_count
+#if defined LOCATE_MIN
+ ,
+ __global Coordinates2D *min_loc, uint max_min_loc_count
+#endif
+#if defined LOCATE_MAX
+ ,
+ __global Coordinates2D *max_loc, uint max_max_loc_count
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // One pixel per work-item, compared against the precomputed global min/max
+ DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
+ // Without -DCOUNT_MIN_MAX the whole body compiles out and the kernel is a no-op
+#if defined COUNT_MIN_MAX
+ if(value == min_max[0])
+ {
+ // Count every minimum occurrence; record its coordinates while slots remain
+ uint idx = atomic_inc(&min_max_count[0]);
+#if defined LOCATE_MIN
+ if(idx < max_min_loc_count)
+ {
+ min_loc[idx].x = get_global_id(0);
+ min_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+ if(value == min_max[1])
+ {
+ // Count every maximum occurrence; record its coordinates while slots remain
+ uint idx = atomic_inc(&min_max_count[1]);
+#if defined LOCATE_MAX
+ if(idx < max_max_loc_count)
+ {
+ max_loc[idx].x = get_global_id(0);
+ max_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+#endif
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
new file mode 100644
index 0000000000..f860c96bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+/** Applies a MIN, MAX or MEDIAN filter over a 3x3 box neighbourhood of the input image.
+ *
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Three 16-wide rows cover the 3x3 neighbourhoods of 8 consecutive output pixels
+    uchar16 row_above = vload16(0, offset(&src, -1, -1));
+    uchar16 row_mid   = vload16(0, offset(&src, -1, 0));
+    uchar16 row_below = vload16(0, offset(&src, -1, 1));
+
+#if defined MIN
+    // Column-wise reduction first, then each sliding window of 3
+    uchar16 col_red = min(min(row_above, row_mid), row_below);
+    uchar8  result  = row_reduce_min_3(col_red);
+#elif defined MAX
+    uchar16 col_red = max(max(row_above, row_mid), row_below);
+    uchar8  result  = row_reduce_max_3(col_red);
+#elif defined MEDIAN
+    // Median of the 9 box samples per output pixel via a sorting network
+    uchar8 result = sort9(row_above.s01234567, row_above.s12345678, row_above.s23456789,
+                          row_mid.s01234567, row_mid.s12345678, row_mid.s23456789,
+                          row_below.s01234567, row_below.s12345678, row_below.s23456789);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
+
+/** Applies a MIN, MAX or MEDIAN filter over a 3x3 cross-shaped neighbourhood of the input image.
+ *
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Cross shape: single pixels above and below, a full 3-wide window in the middle row
+    uchar8  row_above = vload8(0, offset(&src, 0, -1));
+    uchar16 row_mid   = vload16(0, offset(&src, -1, 0));
+    uchar8  row_below = vload8(0, offset(&src, 0, 1));
+
+#if defined MIN
+    uchar8 mid_red = row_reduce_min_3(row_mid);
+    uchar8 result  = min(min(row_above, row_below), mid_red);
+#elif defined MAX
+    uchar8 mid_red = row_reduce_max_3(row_mid);
+    uchar8 result  = max(max(row_above, row_below), mid_red);
+#elif defined MEDIAN
+    // Median of the 5 cross samples per output pixel
+    uchar8 result = sort5(row_above, row_mid.s01234567, row_mid.s12345678, row_mid.s23456789, row_below);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
+
+/** Applies a MIN, MAX or MEDIAN filter over a 3x3 disk neighbourhood of the input image.
+ *
+ * @note A 3x3 disk covers the same samples as a 3x3 box, so the computation matches the box filter.
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Three 16-wide rows cover the neighbourhoods of 8 consecutive output pixels
+    uchar16 row_above = vload16(0, offset(&src, -1, -1));
+    uchar16 row_mid   = vload16(0, offset(&src, -1, 0));
+    uchar16 row_below = vload16(0, offset(&src, -1, 1));
+
+#if defined MIN
+    uchar16 col_red = min(min(row_above, row_mid), row_below);
+    uchar8  result  = row_reduce_min_3(col_red);
+#elif defined MAX
+    uchar16 col_red = max(max(row_above, row_mid), row_below);
+    uchar8  result  = row_reduce_max_3(col_red);
+#elif defined MEDIAN
+    // Median of the 9 samples per output pixel via a sorting network
+    uchar8 result = sort9(row_above.s01234567, row_above.s12345678, row_above.s23456789,
+                          row_mid.s01234567, row_mid.s12345678, row_mid.s23456789,
+                          row_below.s01234567, row_below.s12345678, row_below.s23456789);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
new file mode 100644
index 0000000000..d9ae95fd2d
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network returning, for each of 8 adjacent output pixels, the median of a
+ * 5x5 disk-shaped neighbourhood (21 samples per pixel: 3-wide outer rows, 5-wide inner rows).
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ // p0..p20 are the 21 disk samples for 8 output pixels, taken as sliding windows of the 5 rows
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top.s01234567;
+ uchar8 p4 = top.s12345678;
+ uchar8 p5 = top.s23456789;
+ uchar8 p6 = top.s3456789A;
+ uchar8 p7 = top.s456789AB;
+ uchar8 p8 = middle.s01234567;
+ uchar8 p9 = middle.s12345678;
+ uchar8 p10 = middle.s23456789;
+ uchar8 p11 = middle.s3456789A;
+ uchar8 p12 = middle.s456789AB;
+ uchar8 p13 = bottom.s01234567;
+ uchar8 p14 = bottom.s12345678;
+ uchar8 p15 = bottom.s23456789;
+ uchar8 p16 = bottom.s3456789A;
+ uchar8 p17 = bottom.s456789AB;
+ uchar8 p18 = bottom2.s01234567;
+ uchar8 p19 = bottom2.s12345678;
+ uchar8 p20 = bottom2.s23456789;
+
+ // Partial 21-input sorting network: performs only the compare/exchange steps
+ // needed to settle the median element (p10); the exact order is significant.
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p4, p5);
+ SORT(p6, p7);
+ SORT(p8, p9);
+ SORT(p10, p11);
+ SORT(p12, p13);
+ SORT(p14, p15);
+ SORT(p16, p17);
+ SORT(p18, p19);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p4, p6);
+ SORT(p5, p7);
+ SORT(p8, p10);
+ SORT(p9, p11);
+ SORT(p12, p14);
+ SORT(p13, p15);
+ SORT(p16, p18);
+ SORT(p17, p19);
+ SORT(p1, p2);
+ SORT(p5, p6);
+ SORT(p0, p4);
+ SORT(p3, p7);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p8, p12);
+ SORT(p11, p15);
+ SORT(p17, p18);
+ SORT(p16, p20);
+ SORT(p1, p5);
+ SORT(p2, p6);
+ SORT(p9, p13);
+ SORT(p10, p14);
+ SORT(p0, p8);
+ SORT(p7, p15);
+ SORT(p17, p20);
+ SORT(p1, p4);
+ SORT(p3, p6);
+ SORT(p9, p12);
+ SORT(p11, p14);
+ SORT(p18, p20);
+ SORT(p0, p16);
+ SORT(p2, p4);
+ SORT(p3, p5);
+ SORT(p10, p12);
+ SORT(p11, p13);
+ SORT(p1, p9);
+ SORT(p6, p14);
+ SORT(p19, p20);
+ SORT(p3, p4);
+ SORT(p11, p12);
+ SORT(p1, p8);
+ SORT(p2, p10);
+ SORT(p5, p13);
+ SORT(p7, p14);
+ SORT(p3, p11);
+ SORT(p2, p8);
+ SORT(p4, p12);
+ SORT(p7, p13);
+ SORT(p1, p17);
+ SORT(p3, p10);
+ SORT(p5, p12);
+ SORT(p1, p16);
+ SORT(p2, p18);
+ SORT(p3, p9);
+ SORT(p6, p12);
+ SORT(p2, p16);
+ SORT(p3, p8);
+ SORT(p7, p12);
+ SORT(p5, p9);
+ SORT(p6, p10);
+ SORT(p4, p8);
+ SORT(p7, p11);
+ SORT(p3, p19);
+ SORT(p5, p8);
+ SORT(p7, p10);
+ SORT(p3, p18);
+ SORT(p4, p20);
+ SORT(p6, p8);
+ SORT(p7, p9);
+ SORT(p3, p17);
+ SORT(p5, p20);
+ SORT(p7, p8);
+ SORT(p3, p16);
+ SORT(p6, p20);
+ SORT(p5, p17);
+ SORT(p7, p20);
+ SORT(p4, p16);
+ SORT(p6, p18);
+ SORT(p5, p16);
+ SORT(p7, p19);
+ SORT(p7, p18);
+ SORT(p6, p16);
+ SORT(p7, p17);
+ SORT(p10, p18);
+ SORT(p7, p16);
+ SORT(p9, p17);
+ SORT(p8, p16);
+ SORT(p9, p16);
+ SORT(p10, p16);
+
+ // p10 now holds the element-wise median of the 21 samples
+ return p10;
+}
+
+/** Sorting network returning, for each of 8 adjacent output pixels, the median of a
+ * full 5x5 box neighbourhood (25 samples per pixel).
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ // p0..p24 are the 25 box samples for 8 output pixels, taken as sliding windows of the 5 rows
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top2.s3456789A;
+ uchar8 p4 = top2.s456789AB;
+ uchar8 p5 = top.s01234567;
+ uchar8 p6 = top.s12345678;
+ uchar8 p7 = top.s23456789;
+ uchar8 p8 = top.s3456789A;
+ uchar8 p9 = top.s456789AB;
+ uchar8 p10 = middle.s01234567;
+ uchar8 p11 = middle.s12345678;
+ uchar8 p12 = middle.s23456789;
+ uchar8 p13 = middle.s3456789A;
+ uchar8 p14 = middle.s456789AB;
+ uchar8 p15 = bottom.s01234567;
+ uchar8 p16 = bottom.s12345678;
+ uchar8 p17 = bottom.s23456789;
+ uchar8 p18 = bottom.s3456789A;
+ uchar8 p19 = bottom.s456789AB;
+ uchar8 p20 = bottom2.s01234567;
+ uchar8 p21 = bottom2.s12345678;
+ uchar8 p22 = bottom2.s23456789;
+ uchar8 p23 = bottom2.s3456789A;
+ uchar8 p24 = bottom2.s456789AB;
+
+ // Partial 25-input sorting network: performs only the compare/exchange steps
+ // needed to settle the median element (p12); the exact order is significant.
+ SORT(p1, p2);
+ SORT(p0, p1);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p3, p4);
+ SORT(p4, p5);
+ SORT(p0, p3);
+ SORT(p2, p5);
+ SORT(p2, p3);
+ SORT(p1, p4);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p7, p8);
+ SORT(p6, p7);
+ SORT(p7, p8);
+ SORT(p10, p11);
+ SORT(p9, p10);
+ SORT(p10, p11);
+ SORT(p6, p9);
+ SORT(p8, p11);
+ SORT(p8, p9);
+ SORT(p7, p10);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p0, p6);
+ SORT(p4, p10);
+ SORT(p4, p6);
+ SORT(p2, p8);
+ SORT(p2, p4);
+ SORT(p6, p8);
+ SORT(p1, p7);
+ SORT(p5, p11);
+ SORT(p5, p7);
+ SORT(p3, p9);
+ SORT(p3, p5);
+ SORT(p7, p9);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p5, p6);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p12, p13);
+ SORT(p13, p14);
+ SORT(p16, p17);
+ SORT(p15, p16);
+ SORT(p16, p17);
+ SORT(p12, p15);
+ SORT(p14, p17);
+ SORT(p14, p15);
+ SORT(p13, p16);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p19, p20);
+ SORT(p18, p19);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p21, p23);
+ SORT(p22, p24);
+ SORT(p22, p23);
+ SORT(p18, p21);
+ SORT(p20, p23);
+ SORT(p20, p21);
+ SORT(p19, p22);
+ SORT(p22, p24);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p12, p18);
+ SORT(p16, p22);
+ SORT(p16, p18);
+ SORT(p14, p20);
+ SORT(p20, p24);
+ SORT(p14, p16);
+ SORT(p18, p20);
+ SORT(p22, p24);
+ SORT(p13, p19);
+ SORT(p17, p23);
+ SORT(p17, p19);
+ SORT(p15, p21);
+ SORT(p15, p17);
+ SORT(p19, p21);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p17, p18);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p0, p12);
+ SORT(p8, p20);
+ SORT(p8, p12);
+ SORT(p4, p16);
+ SORT(p16, p24);
+ SORT(p12, p16);
+ SORT(p2, p14);
+ SORT(p10, p22);
+ SORT(p10, p14);
+ SORT(p6, p18);
+ SORT(p6, p10);
+ SORT(p10, p12);
+ SORT(p1, p13);
+ SORT(p9, p21);
+ SORT(p9, p13);
+ SORT(p5, p17);
+ SORT(p13, p17);
+ SORT(p3, p15);
+ SORT(p11, p23);
+ SORT(p11, p15);
+ SORT(p7, p19);
+ SORT(p7, p11);
+ SORT(p11, p13);
+ SORT(p11, p12);
+ // p12 now holds the element-wise median of the 25 samples
+ return p12;
+}
+
+/** Applies a MIN, MAX or MEDIAN filter over a 5x5 box neighbourhood of the input image.
+ *
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Five 16-wide rows cover the 5x5 neighbourhoods of 8 consecutive output pixels
+    uchar16 r0 = vload16(0, offset(&src, -2, -2));
+    uchar16 r1 = vload16(0, offset(&src, -2, -1));
+    uchar16 r2 = vload16(0, offset(&src, -2, 0));
+    uchar16 r3 = vload16(0, offset(&src, -2, 1));
+    uchar16 r4 = vload16(0, offset(&src, -2, 2));
+
+#if defined MIN
+    // Column-wise reduction first, then each sliding window of 5
+    uchar16 col_red = min(min(min(r0, r1), min(r3, r4)), r2);
+    uchar8  result  = row_reduce_min_5(col_red);
+#elif defined MAX
+    uchar16 col_red = max(max(max(r0, r1), max(r3, r4)), r2);
+    uchar8  result  = row_reduce_max_5(col_red);
+#elif defined MEDIAN
+    // Median of the 25 box samples per output pixel via a sorting network
+    uchar8 result = median_box5x5(r0, r1, r2, r3, r4);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
+
+/** Applies a MIN, MAX or MEDIAN filter over a 5x5 cross-shaped neighbourhood of the input image.
+ *
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Cross shape: single pixels two rows above/below and one row above/below,
+    // plus a full 5-wide window in the middle row
+    uchar16 r_m2 = vload16(0, offset(&src, 0, -2));
+    uchar16 r_m1 = vload16(0, offset(&src, 0, -1));
+    uchar16 r_c  = vload16(0, offset(&src, -2, 0));
+    uchar16 r_p1 = vload16(0, offset(&src, 0, 1));
+    uchar16 r_p2 = vload16(0, offset(&src, 0, 2));
+
+#if defined MIN
+    uchar8 vert   = min(min(r_m2.s01234567, r_m1.s01234567), min(r_p1.s01234567, r_p2.s01234567));
+    uchar8 result = min(row_reduce_min_5(r_c), vert);
+#elif defined MAX
+    uchar8 vert   = max(max(r_m2.s01234567, r_m1.s01234567), max(r_p1.s01234567, r_p2.s01234567));
+    uchar8 result = max(row_reduce_max_5(r_c), vert);
+#elif defined MEDIAN
+    // Median of the 9 cross samples: 2 above, the 5-wide centre row, 2 below
+    uchar8 result = sort9(r_m2.s01234567, r_m1.s01234567,
+                          r_c.s01234567, r_c.s12345678, r_c.s23456789, r_c.s3456789A, r_c.s456789AB,
+                          r_p1.s01234567, r_p2.s01234567);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
+
+/** Applies a MIN, MAX or MEDIAN filter over a 5x5 disk neighbourhood of the input image.
+ *
+ * @note The filter operation is selected at build time by defining -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Disk shape: the outer rows span 3 columns, the three centre rows span 5
+    uchar16 r_m2 = vload16(0, offset(&src, -1, -2));
+    uchar16 r_m1 = vload16(0, offset(&src, -2, -1));
+    uchar16 r_c  = vload16(0, offset(&src, -2, 0));
+    uchar16 r_p1 = vload16(0, offset(&src, -2, 1));
+    uchar16 r_p2 = vload16(0, offset(&src, -1, 2));
+
+#if defined MIN
+    // Reduce the 3-wide outer rows and the 5-wide inner rows separately, then combine
+    uchar8 outer_red = row_reduce_min_3(min(r_m2, r_p2));
+    uchar8 inner_red = row_reduce_min_5(min(min(r_m1, r_p1), r_c));
+    uchar8 result    = min(outer_red, inner_red);
+#elif defined MAX
+    uchar8 outer_red = row_reduce_max_3(max(r_m2, r_p2));
+    uchar8 inner_red = row_reduce_max_5(max(max(r_m1, r_p1), r_c));
+    uchar8 result    = max(outer_red, inner_red);
+#elif defined MEDIAN
+    // Median of the 21 disk samples per output pixel via a sorting network
+    uchar8 result = median_disk5x5(r_m2, r_m1, r_c, r_p1, r_p2);
+#else
+#error "Unsupported filter function"
+#endif
+
+    // Write 8 filtered pixels
+    vstore8(result, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
new file mode 100644
index 0000000000..77da2091b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts element-wise two uchar8 vectors in place: a receives the minima, b the maxima.
+ *
+ * @note Both arguments are evaluated more than once: pass plain lvalues only.
+ * @note The body is wrapped in do/while(0) so that `SORT(a, b);` behaves as a single
+ *       statement in every context, including an un-braced if/else branch.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b)                  \
+    do                              \
+    {                               \
+        uchar8 min_val = min(a, b); \
+        uchar8 max_val = max(a, b); \
+        a              = min_val;   \
+        b              = max_val;   \
+    } while(0)
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @note This is a partial network: it performs only the compare/exchange steps
+ *       needed to settle the median; the inputs are not fully sorted.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p1, p2);
+ SORT(p0, p4);
+ SORT(p1, p4);
+ SORT(p2, p4);
+
+ // p2 now holds the element-wise median of the five inputs
+ return p2;
+}
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @note This is a partial network: it performs only the compare/exchange steps
+ *       needed to settle the median; the inputs are not fully sorted.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eighth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p1);
+ SORT(p3, p4);
+ SORT(p6, p7);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p3);
+ SORT(p5, p8);
+ SORT(p4, p7);
+ SORT(p3, p6);
+ SORT(p1, p4);
+ SORT(p2, p5);
+ SORT(p4, p7);
+ SORT(p4, p2);
+ SORT(p6, p4);
+ SORT(p4, p2);
+
+ // p4 now holds the element-wise median of the nine inputs
+ return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+    // The three 8-wide shifted views of the input form the 3-element windows
+    uchar8 w0 = val.s01234567;
+    uchar8 w1 = val.s12345678;
+    uchar8 w2 = val.s23456789;
+    return min(min(w0, w1), w2);
+}
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+    // The three 8-wide shifted views of the input form the 3-element windows
+    uchar8 w0 = val.s01234567;
+    uchar8 w1 = val.s12345678;
+    uchar8 w2 = val.s23456789;
+    return max(max(w0, w1), w2);
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+    // The five 8-wide shifted views of the input form the 5-element windows
+    uchar8 w0 = val.s01234567;
+    uchar8 w1 = val.s12345678;
+    uchar8 w2 = val.s23456789;
+    uchar8 w3 = val.s3456789A;
+    uchar8 w4 = val.s456789AB;
+    return min(min(min(w0, w1), min(w2, w3)), w4);
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+    // The five 8-wide shifted views of the input form the 5-element windows
+    uchar8 w0 = val.s01234567;
+    uchar8 w1 = val.s12345678;
+    uchar8 w2 = val.s23456789;
+    uchar8 w3 = val.s3456789A;
+    uchar8 w4 = val.s456789AB;
+    return max(max(max(w0, w1), max(w2, w3)), w4);
+}
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
new file mode 100644
index 0000000000..0e388d7496
--- /dev/null
+++ b/src/core/CL/cl_kernels/nonmax.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs Non maxima suppression over a 3x3 window on a given image.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load the 8 candidate values processed by this work-item
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ // Fast path: if all 8 candidates are zero, none can be a maximum
+ if(all(vc == (DATA_TYPE)0))
+ {
+ vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);
+
+ return;
+ }
+
+ // Row above: a candidate survives only if it is >= all three upper neighbours
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+ // Same row: >= against the left neighbour, strict > against the right.
+ // The >=/> asymmetry (>= for above/left, > for below/right) breaks ties so that
+ // only one of two equal neighbouring maxima is kept.
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+ out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+ // Row below: strict > against all three lower neighbours
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+ out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+ vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
new file mode 100644
index 0000000000..076b0d8909
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply cross map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ DATA_TYPE acc = 0;
+
+ const int num_of_slices = get_global_size(2);
+ const int current_slice = get_global_id(2);
+
+ const int left_slice = max(current_slice - (int)radius, (int)0);
+ const int right_slice = min(current_slice + (int)radius, (int)(num_of_slices - 1));
+
+ for(int i = left_slice; i <= right_slice; i++)
+ {
+ acc += *(__global DATA_TYPE *)tensor3D_offset(&squared_in, 0, 0, i - current_slice);
+ }
+
+ const float normalized = pow(kappa + coeff * (float)acc, beta);
+
+ const float normalized_pixel = (float) * ((__global DATA_TYPE *)in.ptr) / normalized;
+
+ *(__global DATA_TYPE *)out.ptr = CONVERT(normalized_pixel, DATA_TYPE);
+}
+
+/** Apply in map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ acc_vec = 0;
+
+ const int current_pos = get_global_id(0) << 2;
+
+ const int left_pos = max(current_pos - (int)radius, -3);
+ const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1));
+
+ for(int i = left_pos; i <= right_pos; i += 1)
+ {
+ acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0));
+ }
+
+ const float4 normalized = pow((float4)kappa + coeff * (float4)acc_vec, beta);
+
+ const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized;
+
+ vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
new file mode 100644
index 0000000000..e1131d5573
--- /dev/null
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/*
+ *The criteria for lost tracking is that the spatial gradient matrix has:
+ * - Determinant less than DETERMINANT_THR
+ * - or minimum eigenvalue is smaller than EIGENVALUE_THR
+ *
+ * The thresholds for the determinant and the minimum eigenvalue is
+ * defined by the OpenVX spec
+ *
+ * Note: Also lost tracking happens when the point tracked coordinate is outside
+ * the image coordinates
+ *
+ * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
+ */
+
+/* Internal Lucas-Kanade Keypoint struct */
+typedef struct InternalKeypoint
+{
+ float x; /**< The x coordinate. */
+ float y; /**< The y coordinate. */
+ float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+ float dummy;
+} InternalKeypoint;
+
+/** Threshold for the determinant. Used for lost tracking criteria */
+#define DETERMINANT_THR 1.0e-07f
+
+/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
+#define EIGENVALUE_THR 1.0e-04f
+
+/** Constants used for Lucas-Kanade Algorithm */
+#define W_BITS (14)
+#define FLT_SCALE (1.0f / (float)(1 << 20))
+#define D0 ((float)(1 << W_BITS))
+#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
+
+/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
+ *
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level(
+ __global float4 *old_points_internal,
+ __global float4 *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ // Get old and new keypoints
+ float4 old_point = old_points_internal[idx];
+ float4 new_point = new_points_internal[idx];
+
+ // Scale accordingly with the pyramid_scale
+ old_point.xy *= (float2)(2.0f);
+ new_point.xy *= (float2)(2.0f);
+
+ old_points_internal[idx] = old_point;
+ new_points_internal[idx] = new_point;
+}
+
+/** Initializes the internal new points array when the level of pyramid is equal to max.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max(
+ __global Keypoint *old_points,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+
+ // Get old keypoint to track
+ InternalKeypoint old_point_internal;
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = old_point_internal;
+}
+
+/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max_initial_estimate(
+ __global Keypoint *old_points,
+ __global Keypoint *new_points_estimates,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+ Keypoint new_point_estimate = new_points_estimates[idx];
+ InternalKeypoint old_point_internal;
+ InternalKeypoint new_point_internal;
+
+ // Get old keypoint to track
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Get new keypoint to track
+ new_point_internal.x = new_point_estimate.x * scale;
+ new_point_internal.y = new_point_estimate.y * scale;
+ new_point_internal.tracking_status = new_point_estimate.tracking_status;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = new_point_internal;
+}
+
+/** Truncates the coordinates stored in new_points array
+ *
+ * @param[in] new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid.
+ * @param[out] new_points An array of internal key points that are defined at the new_images high resolution pyramid.
+ */
+__kernel void finalize(
+ __global InternalKeypoint *new_points_internal,
+ __global Keypoint *new_points)
+{
+ int idx = get_global_id(0);
+
+ // Load internal keypoint
+ InternalKeypoint new_point_internal = new_points_internal[idx];
+
+ // Calculate output point
+ Keypoint new_point;
+ new_point.x = round(new_point_internal.x);
+ new_point.y = round(new_point_internal.y);
+ new_point.tracking_status = new_point_internal.tracking_status;
+
+ // Store new point
+ new_points[idx] = new_point;
+}
+
+/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1.
+ *
+ * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8
+ * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes)
+ * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes)
+ * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image
+ * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16
+ * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes)
+ * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes)
+ * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
+ * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. Supported data types: S16
+ * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes)
+ * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes)
+ * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
+ * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if level 0 of the pyramid
+ */
+void __kernel lktracker_stage0(
+ IMAGE_DECLARATION(old_image),
+ IMAGE_DECLARATION(old_scharr_gx),
+ IMAGE_DECLARATION(old_scharr_gy),
+ __global float4 *old_points,
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0)
+{
+ int idx = get_global_id(0);
+
+ Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
+ Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
+ Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
+
+ // Get old keypoint
+ float2 old_keypoint = old_points[idx].xy - (float2)half_window;
+
+ // Get the floor value
+ float2 iold_keypoint = floor(old_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+
+ // Not valid coordinate. It sets min_eig to 0.0f
+ coeff[idx].s3 = 0.0f;
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = old_keypoint - iold_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Scharr images
+ // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
+ // w_scharr.s1 = ab.x * (1.0f - ab.y)
+ // w_scharr.s2 = (1.0f - ab.x) * ab.y
+ // w_scharr.s3 = ab.x * ab.y
+
+ float4 w_scharr;
+ w_scharr.s3 = ab.x * ab.y;
+ w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y;
+ w_scharr.s12 = ab - (float2)w_scharr.s3;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round(w_scharr.s0 * D0)
+ // w.s1 = round(w_scharr.s1 * D0)
+ // w.s2 = round(w_scharr.s2 * D0)
+ // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w = round(w_scharr * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ int4 iG = (int4)0;
+
+ // Window offset
+ int window_offset = idx * window_dimension_pow2;
+
+ // Compute Spatial Gradient Matrix G
+ for(ushort ky = 0; ky < window_dimension; ++ky)
+ {
+ int offset_y = iold_keypoint.y + ky;
+ for(ushort kx = 0; kx < window_dimension; ++kx)
+ {
+ int offset_x = iold_keypoint.x + kx;
+ float4 px;
+
+ // Load values from old_image for computing the bilinear interpolation
+ px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
+ vload2(0, offset(&old_image, offset_x, offset_y + 1))));
+
+ // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
+ float4 old_i;
+
+ // Compute bilinear interpolation (with D1 scale factor) for ival
+ old_i.s0 = dot(px, w) * D1;
+
+ // Load values from old_scharr_gx for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for ixval
+ old_i.s1 = dot(px, w_scharr);
+
+ // Load values from old_scharr_gy for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for iyval
+ old_i.s2 = dot(px, w_scharr);
+
+ // Rounding (it could be omitted. Used just for matching the VX implementation)
+ int4 iold = convert_int4(round(old_i));
+
+ // Accumulate values in the Spatial Gradient Matrix
+ iG.s0 += (int)(iold.s1 * iold.s1);
+ iG.s1 += (int)(iold.s1 * iold.s2);
+ iG.s2 += (int)(iold.s2 * iold.s2);
+
+ // Store ival, ixval and iyval
+ iold_val[window_offset + kx] = convert_short4(iold);
+ }
+ window_offset += window_dimension;
+ }
+
+ // Scale iA11, iA12 and iA22
+ float4 G = convert_float4(iG) * (float4)FLT_SCALE;
+
+ // Compute minimum eigen value
+ G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
+
+ // Store A11, A12, A22 and min_eig
+ coeff[idx] = G;
+}
+
+/** Computes the motion vector for a given keypoint
+ *
+ * @param[in] new_image_ptr Pointer to the input new image. Supported data types: U8
+ * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes)
+ * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes)
+ * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] num_iterations The maximum number of iterations
+ * @param[in] epsilon The value for terminating the algorithm.
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if level of pyramid = 0
+ * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
+ * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ */
+void __kernel lktracker_stage1(
+ IMAGE_DECLARATION(new_image),
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const int num_iterations,
+ const float epsilon,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0,
+ const int term_iteration,
+ const int term_epsilon)
+{
+ int idx = get_global_id(0);
+ Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ float4 G = coeff[idx];
+
+ // Determinant
+ float D = G.s0 * G.s2 - G.s1 * G.s1;
+
+ // Check if it is a good point to track
+ if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0;
+ }
+
+ return;
+ }
+
+ // Compute inverse
+ //D = native_recip(D);
+ D = 1.0 / D;
+
+ // Get new keypoint
+ float2 new_keypoint = new_points[idx].xy - (float)half_window;
+
+ // Get new point
+ float2 out_new_point = new_points[idx].xy;
+
+ // Keep delta obtained in the previous iteration
+ float2 prev_delta = (float2)0.0f;
+
+ int j = 0;
+ while(j < num_iterations)
+ {
+ // Get the floor value
+ float2 inew_keypoint = floor(new_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+ else
+ {
+ new_points[idx].xy = out_new_point;
+ }
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = new_keypoint - inew_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
+ // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
+ // w.s2 = round((1.0f - ab.x) * ab.y * D0)
+ // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w.s3 = ab.x * ab.y;
+ w.s0 = w.s3 + 1.0f - ab.x - ab.y;
+ w.s12 = ab - (float2)w.s3;
+ w = round(w * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2;
+
+ // Mismatch vector
+ int2 ib = 0;
+
+ // Old val offset
+ int old_val_offset = idx * window_dimension_pow2;
+
+ for(int ky = 0; ky < window_dimension; ++ky)
+ {
+ for(int kx = 0; kx < window_dimension; ++kx)
+ {
+ // ival, ixval and iyval have been computed in the previous stage
+ int4 old_ival = convert_int4(iold_val[old_val_offset]);
+
+ // Load values from new_image for computing the bilinear interpolation
+ float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
+ vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
+
+ // Compute bilinear interpolation on new image
+ int jval = (int)round(dot(px, w) * D1);
+
+ // Compute luminance difference
+ int diff = (int)(jval - old_ival.s0);
+
+ // Accumulate values in mismatch vector
+ ib += (diff * old_ival.s12);
+
+ // Update old val offset
+ old_val_offset++;
+ }
+ }
+
+ float2 b = convert_float2(ib) * (float2)FLT_SCALE;
+
+ // Optical Flow
+ float2 delta;
+
+ delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
+ delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
+
+ // Update new point coordinate
+ new_keypoint += delta;
+
+ out_new_point = new_keypoint + (float2)half_window;
+
+ if(term_epsilon == 1)
+ {
+ float mag2 = dot(delta, delta);
+
+ if(mag2 <= epsilon)
+ {
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+ }
+
+ // Check convergence analyzing the previous delta
+ if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
+ {
+ out_new_point -= delta * (float2)0.5f;
+
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+
+ // Update previous delta
+ prev_delta = delta;
+
+ if(term_iteration == 1)
+ {
+ j++;
+ }
+ }
+
+ new_points[idx].xy = out_new_point;
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
new file mode 100644
index 0000000000..ae2031f422
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 elements from each input and widen them to the intermediate type DATA_TYPE_RES
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform multiplication
+#if defined DATA_TYPE_FLOAT
+ // Floating-point inputs: multiply and scale directly, then convert to the output type
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else
+ // Integer inputs: apply the float scale in float precision, round back to the
+ // intermediate type, then convert to the output type (saturating when -DSATURATE is set)
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif
+
+ // Store result
+ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
new file mode 100644
index 0000000000..05c437cd17
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
+#else
+#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
+#endif
+#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32
+ */
+__kernel void pixelwise_mul_int(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const uint scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load 16 elements from each input and widen them to the intermediate type DATA_TYPE_RES
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Multiply, scale via right shift (scale is the shift amount, i.e. a power-of-two divide),
+ // convert to the output type (saturating when -DSATURATE is set) and store the result
+ vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
new file mode 100644
index 0000000000..1902df9b7d
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined POOL_AVG
+#define POOL_OP(x, y) ((x) + (y))
+#else
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif
+
+/** Computes the reciprocal of the number of input elements covered by the current
+ * work-item's pooling window, after clipping the window to the given upper bounds.
+ * Used to turn the accumulated sum into an average for average pooling.
+ */
+float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ // Top-left corner of the pooling window in input coordinates (may be negative due to padding)
+ const int x0 = get_global_id(0) * stride_x - pad_x;
+ const int y0 = get_global_id(1) * stride_y - pad_y;
+ // Bottom-right corner, clipped so the window does not reach past the valid region
+ const int x1 = min(x0 + pool_size, upper_bound_w);
+ const int y1 = min(y0 + pool_size, upper_bound_h);
+ // Reciprocal of the (clipped) window area
+ return 1.f / ((y1 - y0) * (x1 - x0));
+}
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_2(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load the two rows of the 2x2 pooling window
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Reduce the window with POOL_OP: add when -DPOOL_AVG is set, fmax otherwise
+ data0 = POOL_OP(data0, data1);
+ DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+ // For average pooling, divide by the (border-clipped) window area — not always 4 at the edges
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_3(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load the three rows of the 3x3 pooling window
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+ // Reduce the window with POOL_OP: add when -DPOOL_AVG is set, fmax otherwise
+ data0 = POOL_OP(data0, data1);
+ data0 = POOL_OP(data0, data2);
+ DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+ // For average pooling, divide by the (border-clipped) window area — up to 9 for a full window
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
new file mode 100644
index 0000000000..e0f3bf3468
--- /dev/null
+++ b/src/core/CL/cl_kernels/remap.cl
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ // Input is addressed with absolute coordinates (no per-work-item step)
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ // Source coordinates for the 4 output pixels handled by this work-item
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ // Interleave as (x0,y0, x1,y1, x2,y2, x3,y3)
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+ // +0.5 so that the later truncation to int picks the nearest pixel
+ map_coords += (float8)(0.5f);
+
+ // Clamp out-of-range coordinates to the border, fetch the 4 texels and store them
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ // Input is addressed with absolute coordinates (no per-work-item step)
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ // Source coordinates for the 4 output pixels handled by this work-item
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ // Interleave as (x0,y0, x1,y1, x2,y2, x3,y3); fractional parts drive the bilinear weights
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+ // Clamp out-of-range coordinates to the border, interpolate the 4 pixels and store them
+ vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
new file mode 100644
index 0000000000..9ef33b83ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+ // Four consecutive output x coordinates handled by one work-item
+ const float4 xs = coord.s0 + (float4)(0.f, 1.f, 2.f, 3.f);
+ // Map output pixel centres (x + 0.5, y + 0.5) into the input image
+ const float4 mapped_x = (xs + 0.5f) * scale.s0;
+ const float mapped_y = (coord.s1 + 0.5f) * scale.s1;
+ // Interleave as (x0,y0, x1,y1, x2,y2, x3,y3)
+ return (float8)(mapped_x.s0, mapped_y, mapped_x.s1, mapped_y, mapped_x.s2, mapped_y, mapped_x.s3, mapped_y);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ // Four consecutive output x coordinates handled by one work-item
+ const float4 xs = coord.s0 + (float4)(0.f, 1.f, 2.f, 3.f);
+ // Map output pixel centres into the input image, then shift back by half a pixel
+ // so the fractional part feeds the bilinear weights
+ const float4 mapped_x = (xs + 0.5f) * scale.s0 - 0.5f;
+ const float mapped_y = (coord.s1 + 0.5f) * scale.s1 - 0.5f;
+ // Interleave as (x0,y0, x1,y1, x2,y2, x3,y3)
+ return (float8)(mapped_x.s0, mapped_y, mapped_x.s1, mapped_y, mapped_x.s2, mapped_y, mapped_x.s3, mapped_y);
+}
+
+/** Performs a scaling operation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ // Input is addressed with absolute coordinates (no per-work-item step)
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ // Input/output size ratio per dimension
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ // Map the 4 output coordinates of this work-item into input space and clamp to the border
+ const float8 tc = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
+ // Fetch the 4 nearest texels and store them
+ vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs a scaling operation on an image interpolating with the BILINEAR method.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ // Input is addressed with absolute coordinates (no per-work-item step)
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ // Input/output size ratio per dimension
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ // Map the 4 output coordinates of this work-item into input space and clamp to the border
+ const float8 tc = clamp_to_border(transform_bilinear(get_current_coords(), r), input_width, input_height);
+ // Bilinearly interpolate the 4 pixels and store them
+ vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
new file mode 100644
index 0000000000..ef9878c1a3
--- /dev/null
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes Scharr3x3.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scharr3x3(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+    // Output pixels: 8 horizontally-adjacent results per work-item.
+    // Scharr 3x3 coefficients applied below:
+    //   Gx:  -3  0  +3      Gy:  -3 -10  -3
+    //       -10  0 +10            0   0   0
+    //        -3  0  +3           +3 +10  +3
+#ifdef GRAD_X
+    short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+    short8 gy = (short8)0;
+#endif
+
+    // Row0 (y = -1): load 16 pixels starting one column to the left so that
+    // three overlapping 8-wide windows (left/middle/right) are available.
+    uchar16 temp = vload16(0, offset(&src, -1, -1));
+    short8 left = convert_short8(temp.s01234567);
+    short8 middle = convert_short8(temp.s12345678);
+    short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-3);
+    gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+    gy += left * (short8)(-3);
+    gy += middle * (short8)(-10);
+    gy += right * (short8)(-3);
+#endif
+
+    // Row1 (y = 0): middle column weight is 0 for both gradients, so it is skipped.
+    temp = vload16(0, offset(&src, -1, 0));
+    left = convert_short8(temp.s01234567);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-10);
+    gx += right * (short8)(+10);
+#endif
+
+    // Row2 (y = +1)
+    temp = vload16(0, offset(&src, -1, 1));
+    left = convert_short8(temp.s01234567);
+    middle = convert_short8(temp.s12345678);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-3);
+    gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+    gy += left * (short8)(+3);
+    gy += middle * (short8)(+10);
+    gy += right * (short8)(+3);
+#endif
+
+    // Store results
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
new file mode 100644
index 0000000000..4eb0eef770
--- /dev/null
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/***********************************************/
+/* Begin implementation of Sobel3x3 filter */
+/***********************************************/
+
+/** This OpenCL kernel that computes a Sobel3x3 filter.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel3x3(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+    // Output pixels: 8 horizontally-adjacent results per work-item.
+    // Sobel 3x3 coefficients applied below:
+    //   Gx: -1  0 +1       Gy: -1 -2 -1
+    //       -2  0 +2            0  0  0
+    //       -1  0 +1           +1 +2 +1
+#ifdef GRAD_X
+    short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+    short8 gy = (short8)0;
+#endif
+
+    // Row0 (y = -1): load 16 pixels starting one column to the left so that
+    // three overlapping 8-wide windows (left/middle/right) are available.
+    uchar16 temp = vload16(0, offset(&src, -1, -1));
+    short8 left = convert_short8(temp.s01234567);
+    short8 middle = convert_short8(temp.s12345678);
+    short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-1);
+    gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+    gy += left * (short8)(-1);
+    gy += middle * (short8)(-2);
+    gy += right * (short8)(-1);
+#endif
+
+    // Row1 (y = 0): middle column weight is 0 for both gradients, so it is skipped.
+    temp = vload16(0, offset(&src, -1, 0));
+    left = convert_short8(temp.s01234567);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-2);
+    gx += right * (short8)(+2);
+#endif
+
+    // Row2 (y = +1)
+    temp = vload16(0, offset(&src, -1, 1));
+    left = convert_short8(temp.s01234567);
+    middle = convert_short8(temp.s12345678);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-1);
+    gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+    gy += left * (short8)(+1);
+    gy += middle * (short8)(+2);
+    gy += right * (short8)(+1);
+#endif
+
+    // Store results
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel3x3 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel5x5 filter */
+/***********************************************/
+
+/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] left1_coeff_gx Weight of the most left pixel for gx
+ * @param[in] left2_coeff_gx Weight of the left pixel for gx
+ * @param[in] middle_coeff_gx Weight of the middle pixel for gx
+ * @param[in] right1_coeff_gx Weight of the right pixel for gx
+ * @param[in] right2_coeff_gx Weight of the most right pixel for gx
+ * @param[in] left1_coeff_gy Weight of the most left pixel for gy
+ * @param[in] left2_coeff_gy Weight of the left pixel for gy
+ * @param[in] middle_coeff_gy Weight of the middle pixel for gy
+ * @param[in] right1_coeff_gy Weight of the right pixel for gy
+ * @param[in] right2_coeff_gy Weight of the most right pixel for gy
+ *
+ * @return a short16 containing short8 gx and short8 gy values.
+ */
+short16 sobel1x5(
+    Image *src,
+    const short left1_coeff_gx,
+    const short left2_coeff_gx,
+    const short middle_coeff_gx,
+    const short right1_coeff_gx,
+    const short right2_coeff_gx,
+    const short left1_coeff_gy,
+    const short left2_coeff_gy,
+    const short middle_coeff_gy,
+    const short right1_coeff_gy,
+    const short right2_coeff_gy)
+{
+    // Load 16 pixels starting two columns to the left; the five overlapping
+    // 8-wide windows below supply taps -2..+2 for 8 output pixels.
+    uchar16 temp = vload16(0, offset(src, -2, 0));
+    short8 gx = 0;
+    short8 gy = 0;
+    short8 val;
+
+    // Tap -2
+    val = convert_short8(temp.s01234567);
+    gx += val * (short8)left1_coeff_gx;
+    gy += val * (short8)left1_coeff_gy;
+
+    // Tap -1
+    val = convert_short8(temp.s12345678);
+    gx += val * (short8)left2_coeff_gx;
+    gy += val * (short8)left2_coeff_gy;
+
+    // Tap 0
+    val = convert_short8(temp.s23456789);
+    gx += val * (short8)middle_coeff_gx;
+    gy += val * (short8)middle_coeff_gy;
+
+    // Tap +1
+    val = convert_short8(temp.s3456789a);
+    gx += val * (short8)right1_coeff_gx;
+    gy += val * (short8)right1_coeff_gy;
+
+    // Tap +2
+    val = convert_short8(temp.s456789ab);
+    gx += val * (short8)right2_coeff_gx;
+    gy += val * (short8)right2_coeff_gy;
+
+    // Pack both gradients into one vector: low 8 lanes = gx, high 8 lanes = gy.
+    return (short16)(gx, gy);
+}
+
+/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+short8 sobel5x1(
+    Image *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff)
+{
+    short8 val;
+    short8 out = (short8)0;
+
+    // Accumulate the five vertical taps (rows -2..+2); the input is read as
+    // S16 (__global short *), i.e. the output of the horizontal 1x5 pass.
+    val = vload8(0, (__global short *)offset(src, 0, -2));
+    out += val * (short8)up1_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, -1));
+    out += val * (short8)up2_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 0));
+    out += val * (short8)middle_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 1));
+    out += val * (short8)down1_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 2));
+    out += val * (short8)down2_coeff;
+
+    return (short8)(out);
+}
+
+/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x5(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+    // Horizontal pass of the separable 5x5 Sobel:
+    //   gx taps (-1 -2 0 +2 +1) = horizontal derivative,
+    //   gy taps ( 1  4 6  4  1) = horizontal smoothing.
+    short16 gx_gy = sobel1x5(&src,
+                             -1, -2, 0, 2, 1,
+                             1, 4, 6, 4, 1);
+
+    // Store result in dst (low 8 lanes = gx, high 8 lanes = gy)
+#ifdef GRAD_X
+    vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
+ * and output two single channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_x_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] dummy Dummy parameter to easy conditional inclusion
+ */
+__kernel void sobel_separable5x1(
+#ifdef GRAD_X
+    IMAGE_DECLARATION(src_x),
+    IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+    IMAGE_DECLARATION(src_y),
+    IMAGE_DECLARATION(dst_gy),
+#endif
+    int dummy)
+{
+#ifdef GRAD_X
+    Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+    // Vertical pass: the already-differentiated X image gets the smoothing
+    // taps (1 4 6 4 1); the already-smoothed Y image gets the derivative
+    // taps (-1 -2 0 +2 +1).
+#ifdef GRAD_X
+    short8 gx = sobel5x1(&src_x,
+                         1, 4, 6, 4, 1);
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    short8 gy = sobel5x1(&src_y,
+                         -1, -2, 0, 2, 1);
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel5x5 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel7x7 filter */
+/***********************************************/
+
+/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients.
+ * Derivative kernel: -1 -4 -5 0 +5 +4 +1 */
+#define X0 -1
+#define X1 -4
+#define X2 -5
+#define X3 0
+#define X4 5
+#define X5 4
+#define X6 1
+
+/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients.
+ * Smoothing kernel: 1 6 15 20 15 6 1 */
+#define Y0 1
+#define Y1 6
+#define Y2 15
+#define Y3 20
+#define Y4 15
+#define Y5 6
+#define Y6 1
+
+/* Calculates single horizontal iteration: tap (idx - 3) relative to the current
+ * pixel, accumulated into both the derivative (gx) and smoothing (gy) sums.
+ * The input is read as U8 and widened to int. */
+#define SOBEL1x1_HOR(src, gx, gy, idx) \
+    { \
+        int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
+        gx += val * X##idx; \
+        gy += val * Y##idx; \
+    }
+
+/* Calculates single vertical iteration: row (idx - 3) of the S32 intermediate,
+ * weighted by coefficient `direction##idx` (direction is the token X or Y). */
+#define SOBEL1x1_VERT(src, g, direction, idx) \
+    { \
+        int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
+        g += val * (int8)direction##idx; \
+    }
+
+/* Calculates a full 1x7 horizontal convolution (all 7 taps). */
+#define SOBEL1x7(ptr, gx, gy) \
+    SOBEL1x1_HOR(ptr, gx, gy, 0) \
+    SOBEL1x1_HOR(ptr, gx, gy, 1) \
+    SOBEL1x1_HOR(ptr, gx, gy, 2) \
+    SOBEL1x1_HOR(ptr, gx, gy, 3) \
+    SOBEL1x1_HOR(ptr, gx, gy, 4) \
+    SOBEL1x1_HOR(ptr, gx, gy, 5) \
+    SOBEL1x1_HOR(ptr, gx, gy, 6)
+
+/* Calculates a full 7x1 vertical convolution (all 7 taps). */
+#define SOBEL7x1(ptr, g, direction) \
+    SOBEL1x1_VERT(ptr, g, direction, 0) \
+    SOBEL1x1_VERT(ptr, g, direction, 1) \
+    SOBEL1x1_VERT(ptr, g, direction, 2) \
+    SOBEL1x1_VERT(ptr, g, direction, 3) \
+    SOBEL1x1_VERT(ptr, g, direction, 4) \
+    SOBEL1x1_VERT(ptr, g, direction, 5) \
+    SOBEL1x1_VERT(ptr, g, direction, 6)
+
+/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x7(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+    // Accumulators widened to int: gx receives the X (derivative) taps,
+    // gy the Y (smoothing) taps — see the X*/Y* coefficient defines above.
+    int8 gx = (int8)0;
+    int8 gy = (int8)0;
+
+    SOBEL1x7(&src, gx, gy);
+
+    // Store result in dst as S32 intermediates for the 7x1 vertical pass
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global int *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global int *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images, output two single channel S32 images, and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] dummy Dummy parameter to easy conditional inclusion
+ */
+__kernel void sobel_separable7x1(
+#ifdef GRAD_X
+    IMAGE_DECLARATION(src_x),
+    IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+    IMAGE_DECLARATION(src_y),
+    IMAGE_DECLARATION(dst_gy),
+#endif
+    int dummy)
+{
+#ifdef GRAD_X
+    Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+    Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+    // Output pixels. Note the deliberate coefficient swap: the X gradient was
+    // differentiated horizontally in the 1x7 pass, so it now gets the vertical
+    // smoothing (Y) taps; the Y gradient gets the vertical derivative (X) taps.
+#ifdef GRAD_X
+    int8 gx = 0;
+    SOBEL7x1(&src_x, gx, Y);
+    vstore8(gx, 0, (__global int *)dst_gx.ptr);
+#endif
+#ifdef GRAD_Y
+    int8 gy = 0;
+    SOBEL7x1(&src_y, gy, X);
+    vstore8(gy, 0, (__global int *)dst_gy.ptr);
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel7x7 filter */
+/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
new file mode 100644
index 0000000000..632b4a5374
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined USE_F16
+#define MINVAL HALF_MIN
+#define SELECT_DATA_TYPE short
+#define DATA_TYPE half
+#else
+#define MINVAL FLT_MIN
+#define SELECT_DATA_TYPE int
+#define DATA_TYPE float
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** Identifies the maximum value across the 1st dimension (one full row per work item).
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note In case F16 is used -DUSE_F16 must be passed, otherwise the kernel defaults to F32.
+ * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_max(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Initialize local maximum with the reduction identity (MINVAL)
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;
+
+ // Calculate max of row, 16 elements at a time
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ max_val = max(data, max_val);
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle the tail: lanes at or past 'width' are masked to MINVAL so they cannot win the max.
+ // NOTE(review): this vload16 reads up to 15 elements past 'width'; it assumes the
+ // tensor row is padded accordingly - confirm against the kernel's border handling.
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ max_val = max(max_val, select(type_min, data, widx));
+#endif
+
+ // Perform max reduction: fold the 16 lanes down to a single scalar in .s0
+ max_val.s01234567 = max(max_val.s01234567, max_val.s89ABCDEF);
+ max_val.s0123 = max(max_val.s0123, max_val.s4567);
+ max_val.s01 = max(max_val.s01, max_val.s23);
+ max_val.s0 = max(max_val.s0, max_val.s1);
+
+ // Store the row maximum
+ *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
+}
+
+/** Shifts the values of the input tensor by the max calculated in the softmax_layer_max kernel,
+ * then computes the exponential of each shifted element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note In case F16 is used -DUSE_F16 must be passed, otherwise the kernel defaults to F32.
+ * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: F16, F32
+ * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
+ * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_shift_exp_sum(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(max),
+ IMAGE_DECLARATION(dst),
+ IMAGE_DECLARATION(sum),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TO_IMAGE_STRUCT(max);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+
+ // Load max value of 1D logits vector (row)
+ DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
+
+ // Per-lane accumulator for the row sum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sum1D = 0;
+
+ // Shift values by the row max (numerical stability), exponentiate, store and accumulate
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ data = exp(data - max_val);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
+ sum1D += data;
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle the tail: lanes at or past 'width' are zeroed so they do not contribute to the sum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ data = exp(data - max_val);
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ // select() requires operands of the same vector type: use a typed zero vector,
+ // not a scalar 0 (no conforming select() overload takes a scalar first operand)
+ data = select((VEC_DATA_TYPE(DATA_TYPE, 16))0, data, widx);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
+ sum1D += data;
+#endif
+
+ // Perform sum reduction: fold the 16 partial sums down to a single scalar in .s0
+ sum1D.s01234567 = sum1D.s01234567 + sum1D.s89ABCDEF;
+ sum1D.s0123 = sum1D.s0123 + sum1D.s4567;
+ sum1D.s01 = sum1D.s01 + sum1D.s23;
+ sum1D.s0 = sum1D.s0 + sum1D.s1;
+
+ // Store the row sum
+ *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_shift_exp_sum kernel.
+ *
+ * Each work item normalises 16 consecutive elements of one row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+ // Load the sum of exponentials of this row (one scalar per row, indexed by the Y work-item id)
+ DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
+ // Normalise and store: each exponential divided by the row sum
+ vstore16(data / sum_val, 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+}
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
new file mode 100644
index 0000000000..cee116bd75
--- /dev/null
+++ b/src/core/CL/cl_kernels/tablelookup.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs table lookup on U8 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * Each work item maps eight consecutive pixels through the LUT.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: U8
+ */
+__kernel void tablelookup_U8(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global uchar *lut)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Read eight input pixels */
+ const uchar8 in_pix = vload8(0, src.ptr);
+
+ /* Gather the corresponding LUT entries (one scalar lookup per lane) */
+ const uchar8 mapped = (uchar8)(lut[in_pix.s0], lut[in_pix.s1], lut[in_pix.s2], lut[in_pix.s3],
+ lut[in_pix.s4], lut[in_pix.s5], lut[in_pix.s6], lut[in_pix.s7]);
+
+ /* Write the mapped pixels back */
+ vstore8(mapped, 0, dst.ptr);
+}
+
+/** This function performs table lookup on S16 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * Indices that fall outside [0, count) after applying @p offset leave the
+ * corresponding destination pixel unchanged.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: S16
+ * @param[in] offset LUT offset
+ * @param[in] count Number of elements in the LUT
+ */
+__kernel void tablelookup_S16(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global short *lut,
+ uint offset,
+ uint count)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load input data */
+ short8 data = vload8(0, (__global short *)src.ptr);
+
+ /* Load current output data (preserved for out-of-range indices) */
+ int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
+
+ /* Calculate index and per-lane validity mask */
+ int8 index = convert_int8(data) + (int8)(offset);
+ int8 cond = (index >= 0 && index < (int8)count);
+ /* Clamp invalid lanes to 0 so the gather below stays within the LUT.
+ * select() requires operands of the same vector type, so use a typed
+ * zero vector rather than a scalar 0 (no conforming scalar overload). */
+ index = select((int8)0, index, cond);
+
+ /* Load lut data */
+ int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
+ lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
+
+ /* Select output data depending on condition: keep the old pixel on invalid lanes */
+ lut_data = select(out_data, lut_data, cond);
+
+ /* Store result */
+ vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
new file mode 100644
index 0000000000..2b1e6ff35d
--- /dev/null
+++ b/src/core/CL/cl_kernels/threshold.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform binary thresholding on an image.
+ *
+ * Each work item processes 16 pixels: pixels strictly greater than the
+ * threshold become @p true_val, all others become @p false_val.
+ *
+ * @param[in] in_ptr Pointer to the source image
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val Value written when the pixel is below or equal to the threshold
+ * @param[in] true_val Value written when the pixel is above the threshold
+ * @param[in] threshold The threshold value
+ */
+__kernel void threshold_binary(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar threshold)
+{
+ // Map the image arguments onto Image structs
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Read 16 pixels
+ const uchar16 pixels = vload16(0, in.ptr);
+
+ // Pick true_val where pixel > threshold, false_val everywhere else
+ const uchar16 thresholded = select((uchar16)false_val, (uchar16)true_val, pixels > (uchar16)threshold);
+
+ // Write the thresholded pixels back
+ vstore16(thresholded, 0, out.ptr);
+}
+
+/** Perform range thresholding on an image.
+ *
+ * Each work item processes 16 pixels: pixels inside [lower, upper] become
+ * @p true_val, pixels outside the range become @p false_val.
+ *
+ * @param[in] in_ptr Pointer to the source image
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val Value written when the pixel is outside [lower, upper]
+ * @param[in] true_val Value written when the pixel is inside [lower, upper]
+ * @param[in] lower Lower threshold
+ * @param[in] upper Upper threshold
+ */
+__kernel void threshold_range(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar lower,
+ const uchar upper)
+{
+ // Map the image arguments onto Image structs
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Read 16 pixels
+ uchar16 pixels = vload16(0, in.ptr);
+
+ // A pixel is rejected when it lies above 'upper' or below 'lower'
+ pixels = select((uchar16)true_val, (uchar16)false_val, pixels > (uchar16)upper || pixels < (uchar16)lower);
+
+ // Write the thresholded pixels back
+ vstore16(pixels, 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
new file mode 100644
index 0000000000..c30158f280
--- /dev/null
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SWAP_ROW(u0, l0) \
+ ({ \
+ tmp_swap = u0; \
+ u0 = l0; \
+ l0 = tmp_swap; \
+ })
+
+#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ })
+
+#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ SWAP_ROW(u4, l4); \
+ SWAP_ROW(u5, l5); \
+ SWAP_ROW(u6, l6); \
+ SWAP_ROW(u7, l7); \
+ })
+
+#define TRANSPOSE_4x4(u0, u1, u2, u3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp; \
+ tmp.s012 = u0.s123; \
+ u0.s1 = u1.s0; \
+ u0.s2 = u2.s0; \
+ u0.s3 = u3.s0; \
+ u1.s0 = tmp.s0; \
+ u2.s0 = tmp.s1; \
+ u3.s0 = tmp.s2; \
+ \
+ tmp.s01 = u1.s23; \
+ u1.s2 = u2.s1; \
+ u1.s3 = u3.s1; \
+ u2.s1 = tmp.s0; \
+ u3.s1 = tmp.s1; \
+ \
+ tmp.s0 = u2.s3; \
+ u2.s3 = u3.s2; \
+ u3.s2 = tmp.s0; \
+ })
+
+#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \
+ ({ \
+ TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \
+ TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \
+ TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \
+ SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ })
+
+#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \
+ ({ \
+ TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \
+ TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \
+ TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \
+ SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \
+ u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ })
+
+#ifndef DATA_TYPE_IN_BYTES
+#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
+#endif
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3)
+#define VLOAD(x, y) vload4(x, y)
+#define VSTORE(x, y, z) vstore4(x, y, z)
+#define BLOCK_SIZE 4
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)
+#define VLOAD(x, y) vload8(x, y)
+#define VSTORE(x, y, z) vstore8(x, y, z)
+#define BLOCK_SIZE 8
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)
+#define VLOAD(x, y) vload16(x, y)
+#define VSTORE(x, y, z) vstore16(x, y, z)
+#define BLOCK_SIZE 16
+#else
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif
+
+/** This OpenCL kernel computes the matrix transposition of input matrix
+ *
+ * Each work item transposes one BLOCK_SIZE x BLOCK_SIZE tile: it loads the tile
+ * at (x, y), transposes it in registers and stores it at (y, x) in the output.
+ *
+ * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Upper-left element coordinate of the tile handled by this work item
+ uint x = get_global_id(0) * BLOCK_SIZE;
+ uint y = get_global_id(1) * BLOCK_SIZE;
+
+ // Compute source address
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Load the NxN block at (x, y), one row vector per register
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+#if BLOCK_SIZE > 4
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7)));
+#if BLOCK_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15)));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+ // Transpose the block in registers
+ TRANSPOSE();
+
+ // Store the block at the mirrored coordinate (y, x); note the swapped roles of x and y
+ uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes;
+ VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y));
+ VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y));
+ VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y));
+ VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y));
+#if BLOCK_SIZE > 4
+ VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y));
+ VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y));
+ VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y));
+ VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y));
+#if BLOCK_SIZE == 16
+ VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y));
+ VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y));
+ VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y));
+ VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y));
+ VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y));
+ VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y));
+ VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y));
+ VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
new file mode 100644
index 0000000000..87736465d2
--- /dev/null
+++ b/src/core/CL/cl_kernels/types.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/** Keypoint struct */
+typedef struct Keypoint
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+ float strength; /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+ float scale; /**< Initialized to 0 by corner detectors. */
+ float orientation; /**< Initialized to 0 by corner detectors. */
+ int tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+ float error; /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+ ushort x; /**< Top-left x coordinate */
+ ushort y; /**< Top-left y coordinate */
+ ushort width; /**< Width of the detection window */
+ ushort height; /**< Height of the detection window */
+ ushort idx_class; /**< Index of the class */
+ float score; /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
new file mode 100644
index 0000000000..0a4748f452
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns a float8 containing the six affine matrix coefficients (MAT0..MAT5) padded with two zeros. */
+inline const float8 build_affine_mtx()
+{
+ return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
+}
+
+/** Transforms 4 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx affine matrix
+ *
+ * @return a float8 containing 4 2D transformed values (even positions are x, odd positions are y).
+ */
+inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs an affine transform on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * output(x,y) = input(x0,y0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. Sampling coordinates are clamped to the image borders.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
new file mode 100644
index 0000000000..26a8b859a4
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ *
+ * @return float8 with X coordinates clamped to [-1, width] and Y coordinates clamped to [-1, height].
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+ const float4 clamped_x = clamp(coords.even, -1.0f, width);
+ const float4 clamped_y = clamp(coords.odd, -1.0f, height);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four (x, y) coordinate pairs to read: even positions are X coords, odd positions are Y coords.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+ return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+ *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+ *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+ *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates; X is scaled by 4 since each work item processes 4 pixels along X. */
+inline const float2 get_current_coords()
+{
+ return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel's coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+ // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+ // Sets the 4x4 coordinates for each of the four input texels
+ const float8 fc = floor(coords);
+ const float16 c1 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height));
+ const float16 c2 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height));
+ // Loads the values from the input image
+ const float16 t = (float16)(
+ /* tl, tr, bl, br */
+ * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc; // fractional part: distance from the top-left neighbour
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr = (float4)(
+ ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
new file mode 100644
index 0000000000..863b6c9e96
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns a float16 containing the nine perspective matrix coefficients (MAT0..MAT8), zero-padded in the remaining lanes. */
+inline const float16 build_perspective_mtx()
+{
+ return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * (x0/z0,y0/z0)
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx perspective matrix
+ *
+ * @return a float8 containing four 2D transformed values (even positions are x, odd positions are y).
+ */
+inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [z,z+1,z+2,z+3]
+ const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
+ // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs perspective transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * output(x,y) = input(x0/z0,y0/z0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. Sampling coordinates are clamped to the image borders.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr);
+}