From 33ff9ef467153eef05b700820d859515a52481f4 Mon Sep 17 00:00:00 2001 From: Xinghang Zhou Date: Wed, 17 Jan 2018 11:23:39 +0800 Subject: APPBROWSER-400: Implement the tensorshift kernel for fixing DC's alignment issue on OpenGL ES Change-Id: I7a8489bb0fddc72899ea165e414ee87bdbfb45b3 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118106 Reviewed-by: Joel Liang Tested-by: Jenkins Reviewed-by: Anthony Barbier --- arm_compute/core/GLES_COMPUTE/GCKernels.h | 1 + .../GLES_COMPUTE/kernels/GCTensorShiftKernel.h | 83 +++++++ arm_compute/runtime/GLES_COMPUTE/GCFunctions.h | 1 + .../functions/GCDirectConvolutionLayer.h | 25 +- .../runtime/GLES_COMPUTE/functions/GCTensorShift.h | 51 ++++ src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 5 + src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs | 134 ++++++++++ .../kernels/GCDirectConvolutionLayerKernel.cpp | 22 +- .../GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp | 108 +++++++++ .../functions/GCDirectConvolutionLayer.cpp | 19 +- .../GLES_COMPUTE/functions/GCTensorShift.cpp | 40 +++ tests/datasets/ShapeDatasets.h | 22 ++ .../DirectConvolutionLayerTensorShift.cpp | 90 +++++++ .../DirectConvolutionLayerTensorShiftFixture.h | 269 +++++++++++++++++++++ 14 files changed, 859 insertions(+), 11 deletions(-) create mode 100644 arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h create mode 100644 arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h create mode 100644 src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs create mode 100644 src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp create mode 100644 tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp create mode 100644 tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h index a1f3c278c4..40312d121a 100644 --- a/arm_compute/core/GLES_COMPUTE/GCKernels.h +++ 
b/arm_compute/core/GLES_COMPUTE/GCKernels.h @@ -47,6 +47,7 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h" diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h new file mode 100644 index 0000000000..5f108764b4 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_GCTENSORSHIFTKERNEL_H__ +#define __ARM_COMPUTE_GCTENSORSHIFTKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" + +namespace arm_compute +{ +class IGCTensor; +/** Interface for the kernel to shift valid data on a tensor. + * + * For example shifting 3x3 valid data with padding of 1 to right: + * @f[ + * \left( \begin{array}{ccccc} + * 0 & 0 & 0 & 0 & 0 \\ + * a00 & a01 & a02 & 0 & 0 \\ + * a10 & a11 & a12 & 0 & 0 \\ + * a20 & a21 & a22 & 0 & 0 \\ + * 0 & 0 & 0 & 0 & 0 \\ + * \end{array} \right) + * = + * \left( \begin{array}{ccccc} + * 0 & 0 & 0 & 0 & 0 \\ + * 0 & a00 & a01 & a02 & 0 \\ + * 0 & a10 & a11 & a12 & 0 \\ + * 0 & a20 & a21 & a22 & 0 \\ + * 0 & 0 & 0 & 0 & 0 \\ + * \end{array} \right) + * @f] + */ +class GCTensorShiftKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCTensorShiftKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCTensorShiftKernel(const GCTensorShiftKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCTensorShiftKernel &operator=(const GCTensorShiftKernel &) = delete; + /** Allow instances of this class to be moved */ + GCTensorShiftKernel(GCTensorShiftKernel &&) = default; + /** Allow instances of this class to be moved */ + GCTensorShiftKernel &operator=(GCTensorShiftKernel &&) = default; + /** Default destructor */ + ~GCTensorShiftKernel() = default; + /** Set the input of the kernel. + * + * @param[in,out] input Source tensor. 
Data types supported: F16/F32 + */ + void configure(IGCTensor *input); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + IGCTensor *_input; + gles::NDRange _lws; +}; +} +#endif /*__ARM_COMPUTE_GCTENSORSHIFTKERNEL_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h index bbd8218722..6f338568c2 100644 --- a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h +++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h @@ -45,6 +45,7 @@ #include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" #endif /* __ARM_COMPUTE_GCFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h index 5472bdb9ea..c6b948be1f 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
 * * SPDX-License-Identifier: MIT * * @@ -24,9 +24,12 @@ #ifndef __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ #define __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" +#include "arm_compute/runtime/IFunction.h" +#include <memory> @@ -34,14 +37,20 @@ namespace arm_compute { class IGCTensor; -/** Basic function to execute direct convolution function: +/** Basic function to execute direct convolution function. This function calls the following kernels: + * + * -# @ref GCDirectConvolutionLayerKernel + * -# @ref GCFillBorderKernel + * -# @ref GCTensorShiftKernel * * @note Supported kernel size: 1x1, 3x3, and 5x5 * @note This OpenGL ES implementation works with stride_x = 1 and 2 */ -class GCDirectConvolutionLayer : public IGCSimpleFunction +class GCDirectConvolutionLayer : public IFunction { public: + /** Default constructor */ + GCDirectConvolutionLayer(); /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -54,6 +63,14 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
 */ void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override final; + +private: + std::unique_ptr<IGCKernel> _kernel; + GCFillBorderKernel _border_handler; + GCTensorShiftKernel _shift_handler; }; } #endif /* __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h new file mode 100644 index 0000000000..dfcec57044 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_GCTENSORSHIFT_H__ +#define __ARM_COMPUTE_GCTENSORSHIFT_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +#include <memory> + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to execute shift function for tensor. This function applies to fix alignment issue on OpenGL ES: + * + * @note This alignment issue is introduced by limits of compute shader which requires 32/64/128bit alignment for data access on OpenGL ES + */ +class GCTensorShift : public IGCSimpleFunction +{ +public: + /** Initialise the kernel's input, output. + * + * @param[in,out] input Source tensor. Data types supported: F16/F32. + */ + void configure(IGCTensor *input); +}; +} +#endif /* __ARM_COMPUTE_GCTENSORSHIFT_H__ */ diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp index 0b9cd3f4ee..d4ce3888fd 100644 --- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp +++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp @@ -190,6 +190,7 @@ void GCKernel::update_shader_params() const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map = { { "absdiff", "absdiff.cs" }, + { "tensorshift", "tensor_shift.cs" }, { "direct_convolution1x1", "direct_convolution1x1.cs" }, { "direct_convolution3x3", "direct_convolution3x3.cs" }, { "direct_convolution5x5", "direct_convolution5x5.cs" }, @@ -233,6 +234,10 @@ const std::map<std::string, std::string> GCKernelLibrary::_program_source_map = { "absdiff.cs", #include "./cs_shaders/absdiff.csembed" + }, + { + "tensor_shift.cs", +#include "./cs_shaders/tensor_shift.csembed" }, { "convolution_layer.cs", diff --git a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs new file mode 100644 index 0000000000..a0af315c76 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers_cs.h" + +#if defined(DATA_TYPE_FP16) +precision mediump float; +#endif // DATA_TYPE_FP16 + +/** This kernel performs a shift to move "pad_x" columns to the right. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note The width must be passed at compile time using "#define WIDTH n" e.g. "#define WIDTH 1" + * + * @param[in,out] src_ptr Pointer to the source tensor slice. 
Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[in] pad_x The padding of the source tensor in x dimension + */ +SHADER_PARAMS_DECLARATION +{ + Tensor3DAttributes src_attrs; + uint pad_x; +}; + +#if defined(DATA_TYPE_FP16) +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, restrict); + +void main() +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + int n = int(pad_x) % 2; + + if(n == 1) + { + int i = 0; + if((WIDTH % 2) == 1) + { + i = WIDTH + int(pad_x) - 2; + } + else + { + vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 2)))); + vec2 s_end = vec2(s0_end.y, 0.f); + STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end); + i = WIDTH + int(pad_x) - 3; + } + for(; i >= (int(pad_x) + 1); i = i - 2) + { + vec2 s0 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) - 1)))); + vec2 s1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) + 1)))); + vec2 s = vec2(s0.y, s1.x); + STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s); + } + for(int j = 0; j < (int(pad_x) - 1); j = j + 2) + { + vec2 s_origin = vec2(0.f); + STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); + } + vec2 s0_origin = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec2 s_origin = vec2(0.f, s0_origin.x); + STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin); + } + else + { + int i = 0; + if((WIDTH % 2) == 0) + { + i = WIDTH + int(pad_x) - 2; + } + else + { + vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 1)))); + vec2 s_end = vec2(s0_end.x, 0.f); + STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end); + i = WIDTH + int(pad_x) - 3; 
+ } + for(; i >= (int(pad_x)); i = i - 2) + { + vec2 s = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x))))); + STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s); + } + for(int j = 0; j < int(pad_x); j = j + 2) + { + vec2 s = vec2(0.f); + STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); + } + } +} +#elif defined(DATA_TYPE_FP32) +TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, restrict); + +void main() +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + + for(int i = (WIDTH + int(pad_x) - 1); i >= int(pad_x); i--) + { + float sorigin = LOAD(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, (i - int(pad_x)))); + STORE(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, i), sorigin); + } + for(int j = 0; j < int(pad_x); j++) + { + STORE_CURRENT_ITEM(src_ptr, src_iter, 0.f); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); + } +} +#else /* DATA_TYPE_FP16 */ +#error Data type not supported +#endif /* DATA_TYPE_FP16 */ diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp index 23f1c2eada..fd461c53cd 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp @@ -317,12 +317,20 @@ void GCDirectConvolutionLayerKernel::configure(const IGCTensor *inp const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height; // Calculate input right and bottom border - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width; - const int upper_bound_h = 
ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height; - const int padding_right = std::max(upper_bound_w, _conv_pad_x); - const int padding_bottom = std::max(upper_bound_h, _conv_pad_y); + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int input_total_width = std::max(int(input->info()->padding().left), int(_conv_pad_x)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_x)); + const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_y)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_y)); + const int padding_right1 = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_x; + const int padding_bottom1 = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_y; + + const int upper_bound_w = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width; + const int upper_bound_h = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height; + const int padding_right2 = std::max(upper_bound_w, _conv_pad_x); + const int padding_bottom2 = std::max(upper_bound_h, _conv_pad_y); + + const int padding_right = std::max(padding_right1, padding_right2); + const int padding_bottom = std::max(padding_bottom1, padding_bottom2); BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0); @@ -406,6 +414,8 @@ void GCDirectConvolutionLayerKernel::run(const Window &window) add_1D_tensor_argument(idx1, _bias, 4, slice_bias); } + slice.shift(Window::DimX, -(_output->info()->padding()).left); 
+ do { unsigned int idx = 0; diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp new file mode 100644 index 0000000000..c2182171a6 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; +using namespace arm_compute::gles_compute; + +GCTensorShiftKernel::GCTensorShiftKernel() + : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)) +{ +} + +void GCTensorShiftKernel::configure(IGCTensor *input) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + _input = input; + + std::set<std::string> options; + options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0])); + options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1])); + options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2])); + options.emplace("#define WIDTH " + support::cpp11::to_string(input->info()->dimension(0))); + + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + options.emplace(("#define " + dt_name)); + + unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right; + unsigned int num_elems_written_per_iteration_y = 1; + unsigned int num_elems_written_per_iteration_z = 1; + + std::stringstream kernel_name; + kernel_name << "tensorshift"; + + _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options)); + + Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_written_per_iteration_x); + + update_window_and_padding(win, input_access); + + IGCKernel::configure(win); +} + +void GCTensorShiftKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.use(); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + slice.shift(Window::DimX, -(_input->info()->padding()).left); + + do + { + unsigned int idx = 0; + + add_3D_tensor_argument(idx, _input, 1, slice); + + const PaddingSize &padding1 = _input->info()->padding(); + + if(int(padding1.left) == 0) + { + break; + } + + _kernel.set_argument(idx++, static_cast<unsigned int>(padding1.left)); + + _kernel.update_shader_params(); + enqueue(*this, slice, _lws); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp index ae9dd51b8e..769733ca66 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,10 +27,16 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" #include "support/ToolchainSupport.h" using namespace arm_compute; +GCDirectConvolutionLayer::GCDirectConvolutionLayer() + : _kernel(nullptr), _border_handler(), _shift_handler() +{ +} void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info) { @@ -61,4 +67,15 @@ void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor } _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0)); + + _shift_handler.configure(output); +} + +void GCDirectConvolutionLayer::run() +{ + GCScheduler::get().dispatch(_border_handler, false); + GCScheduler::get().memory_barrier(); + GCScheduler::get().dispatch(*_kernel); + GCScheduler::get().memory_barrier(); + GCScheduler::get().dispatch(_shift_handler); } diff --git a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp new file mode 100644 index 0000000000..93496f4b74 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h" + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCTensorShift::configure(IGCTensor *input) +{ + auto k = arm_compute::support::cpp14::make_unique<GCTensorShiftKernel>(); + k->configure(input); + _kernel = std::move(k); +} diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index dbcd9d5000..9114f514aa 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -387,6 +387,28 @@ public: } }; +/** Data set containing small tensor shapes for direct convolution. 
*/ +class SmallDirectConvolutionTensorShiftShapes final : public ShapeDataset +{ +public: + SmallDirectConvolutionTensorShiftShapes() + : ShapeDataset("InputShape", + { + // Batch size 1 + TensorShape{ 35U, 35U, 3U }, + TensorShape{ 32U, 37U, 3U }, + // Batch size 4 + TensorShape{ 32U, 37U, 3U, 4U }, + // Batch size 8 + TensorShape{ 32U, 37U, 3U, 8U }, + TensorShape{ 33U, 35U, 3U, 8U }, + // Arbitrary batch size + TensorShape{ 32U, 37U, 3U, 8U } + }) + { + } +}; + /** Data set containing 2D tensor shapes for DepthConcatenateLayer. */ class DepthConcatenateLayerShapes final : public ShapeDataset { diff --git a/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp b/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp new file mode 100644 index 0000000000..45fb76cad5 --- /dev/null +++ b/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" +#include "tests/GLES_COMPUTE/GCAccessor.h" +#include "tests/PaddingCalculator.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +RelativeTolerance<half> tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */ +RelativeTolerance<float> tolerance_fp32(0.02f); /**< Tolerance for floating point tests */ +constexpr float tolerance_num = 0.07f; /**< Tolerance number */ + +/** Direct convolution data set. */ +const auto data = combine(datasets::SmallDirectConvolutionTensorShiftShapes(), + combine(framework::dataset::make("StrideX", 1, 3), + combine(framework::dataset::make("StrideY", 1, 3), + combine(concat(combine(framework::dataset::make("PadX", 0), + combine(framework::dataset::make("PadY", 0), + framework::dataset::make("KernelSize", 1))), + combine(framework::dataset::make("PadX", 0, 2), + combine(framework::dataset::make("PadY", 0, 2), + framework::dataset::make("KernelSize", { 3, 5 })))), + framework::dataset::make("NumKernels", { 3 }))))); +} // namespace + +TEST_SUITE(GC) +TEST_SUITE(DirectConvolutionLayerTensorShift) + +template <typename T> +using GCDirectConvolutionLayerTensorShiftFixture = DirectConvolutionValidationTensorShiftFixture<GCTensor, GCAccessor, GCDirectConvolutionLayer, T>; + +TEST_SUITE(Float) +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerTensorShiftFixture<half>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F16))) +{ + // Validate output + validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num); +} +TEST_SUITE_END() + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerTensorShiftFixture<float>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(GCAccessor(_target), _reference, tolerance_fp32); +} +TEST_SUITE_END() +TEST_SUITE_END() + +TEST_SUITE_END() +TEST_SUITE_END() +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h new file mode 100644 index 0000000000..d810a765cb --- /dev/null +++ b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "tests/AssetsLibrary.h" +#include "tests/Globals.h" +#include "tests/IAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/validation/Helpers.h" +#include "tests/validation/fixtures/ConvolutionLayerFixture.h" +#include "tests/validation/reference/ConvolutionLayer.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class DirectConvolutionValidationGenericTensorShiftFixture : public framework::Fixture +{ +public: + using TBias = typename std::conditional::type, uint8_t>::value, int32_t, T>::type; + +public: + template + void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, + DataType data_type, int fractional_bits, QuantizationInfo quantization_info) + { + _fractional_bits = fractional_bits; + _quantization_info = quantization_info; + _data_type = data_type; + + const TensorShape weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels); + const TensorShape bias_shape(num_kernels); + const PadStrideInfo info(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR); + const TensorShape output_shape = get_output_shape(input_shape, weights_shape, info); + const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? 
DataType::S32 : data_type; + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info); + } + + template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, + DataType data_type, int fractional_bits, QuantizationInfo quantization_info) + { + _fractional_bits = fractional_bits; + _quantization_info = quantization_info; + _data_type = data_type; + + const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type; + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info); + } + +protected: + template + void fill(U &&tensor, int i) + { + switch(tensor.data_type()) + { + case DataType::QASYMM8: + { + std::uniform_int_distribution distribution(0, 50); + library->fill(tensor, distribution, i); + break; + } + case DataType::F16: + case DataType::F32: + { + std::uniform_real_distribution<> distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, i); + break; + } + case DataType::S32: + { + std::uniform_int_distribution distribution(-5, 5); + library->fill(tensor, distribution, i); + break; + } + default: + library->fill_tensor_uniform(tensor, i); + } + } + + TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info) + { + // 
Create tensors + TensorType src = create_tensor(input_shape, data_type, 1, fixed_point_position, quantization_info); + TensorType weights = create_tensor(weights_shape, data_type, 1, fixed_point_position, quantization_info); + TensorType bias = create_tensor(bias_shape, bias_data_type, 1, fixed_point_position, quantization_info); + TensorType dst = create_tensor(output_shape, data_type, 1, fixed_point_position, quantization_info); + + TensorShape output_shape1 = get_output_shape(output_shape, weights_shape, info); + TensorType dst1 = create_tensor(output_shape1, data_type, 1, fixed_point_position, quantization_info); + + // Create and configure function + FunctionType conv; + conv.configure(&src, &weights, &bias, &dst, info); + FunctionType conv1; + conv1.configure(&dst, &weights, &bias, &dst1, info); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst1.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + src.allocator()->allocate(); + weights.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + dst1.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst1.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensors + fill(AccessorType(src), 0); + fill(AccessorType(weights), 1); + fill(AccessorType(bias), 2); + + // Compute NEConvolutionLayer function + 
GCScheduler::get().memory_barrier(); + conv.run(); + GCScheduler::get().memory_barrier(); + conv1.run(); + + return dst1; + } + + SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info) + { + // Create reference + SimpleTensor src{ input_shape, data_type, 1, fixed_point_position, quantization_info }; + SimpleTensor weights{ weights_shape, data_type, 1, fixed_point_position, quantization_info }; + SimpleTensor bias{ bias_shape, bias_data_type, 1, fixed_point_position, quantization_info }; + + SimpleTensor dst{ output_shape, data_type, 1, fixed_point_position, quantization_info }; + TensorShape output_shape1 = get_output_shape(output_shape, weights_shape, info); + + // Fill reference + fill(src, 0); + fill(weights, 1); + fill(bias, 2); + + dst = reference::convolution_layer(src, weights, bias, output_shape, info); + return reference::convolution_layer(dst, weights, bias, output_shape1, info); + } + + TensorType _target{}; + SimpleTensor _reference{}; + int _fractional_bits{}; + QuantizationInfo _quantization_info{}; + DataType _data_type{}; + +private: + TensorShape get_output_shape(TensorShape in_shape, TensorShape kernel_shape, const PadStrideInfo &info) + { + TensorShape out_shape(in_shape); + const std::pair scaled_dims = scaled_dimensions(in_shape.x(), + in_shape.y(), + kernel_shape.x(), + kernel_shape.y(), + info); + out_shape.set(0, scaled_dims.first); + out_shape.set(1, scaled_dims.second); + out_shape.set(2, kernel_shape[3]); + return out_shape; + } +}; + +template +class DirectConvolutionValidationTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture +{ +public: + template + void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned 
int num_kernels, DataType data_type) + { + DirectConvolutionValidationGenericTensorShiftFixture::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, + QuantizationInfo()); + } +}; + +template +class DirectConvolutionValidationFixedPointTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture +{ +public: + template + void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, int fractional_bits) + { + DirectConvolutionValidationGenericTensorShiftFixture::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, + fractional_bits, + QuantizationInfo()); + } +}; + +template +class DirectConvolutionValidationQuantizedTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture +{ +public: + template + void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info) + { + DirectConvolutionValidationGenericTensorShiftFixture::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, + quantization_info); + } +}; + +template +class DirectConvolutionValidationWithTensorShapesQuantizedTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture +{ +public: + template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, + DataType data_type, QuantizationInfo quantization_info) + { + DirectConvolutionValidationGenericTensorShiftFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, quantization_info); + } +}; + +template +class DirectConvolutionValidationWithTensorShapesTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture +{ +public: + 
template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, + DataType data_type) + { + DirectConvolutionValidationGenericTensorShiftFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, QuantizationInfo()); + } +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute -- cgit v1.2.1