From a1ed41fe2427dfa2b5d0139444ceb77ad16a5a73 Mon Sep 17 00:00:00 2001
From: Jaroslaw Rzepecki <jaroslaw.rzepecki@arm.com>
Date: Fri, 13 Oct 2017 11:13:58 +0100
Subject: IVGCVSW-601: support for asymetric padding in cl conv and depthwise
 conv

Change-Id: I5c6c95091ae77dba96459c0640f9f6167a988c8c
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/91700
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
---
 arm_compute/core/Types.h                          | 60 +++++++++++++++++++++--
 src/core/CL/cl_kernels/convolution_layer.cl       | 14 +++---
 src/core/CL/cl_kernels/depthwise_convolution.cl   | 12 ++---
 src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp   |  6 ++-
 src/core/CL/kernels/CLIm2ColKernel.cpp            | 14 +++---
 src/core/NEON/kernels/NEIm2ColKernel.cpp          | 24 +++++----
 src/core/Utils.cpp                                | 30 ++++++------
 src/runtime/CL/functions/CLConvolutionLayer.cpp   |  3 --
 src/runtime/NEON/functions/NEConvolutionLayer.cpp |  3 --
 tests/datasets/DepthwiseConvolutionDataset.h      |  7 +++
 tests/datasets/SmallConvolutionLayerDataset.h     |  8 +++
 tests/validation/CPP/ConvolutionLayer.cpp         | 30 +++++++-----
 tests/validation/CPP/DepthwiseConvolution.cpp     | 38 ++++++++------
 utils/TypePrinter.h                               |  3 +-
 14 files changed, 166 insertions(+), 86 deletions(-)
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f9766b39be..f52dd12597 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -418,7 +418,32 @@ public:
                   unsigned int pad_x = 0, unsigned int pad_y = 0,
                   DimensionRoundingType round = DimensionRoundingType::FLOOR)
         : _stride(std::make_pair(stride_x, stride_y)),
-          _pad(std::make_pair(pad_x, pad_y)),
+          _pad_left(pad_x),
+          _pad_top(pad_y),
+          _pad_right(pad_x),
+          _pad_bottom(pad_y),
+          _round_type(round)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] stride_x   Stride, in elements, across x.
+     * @param[in] stride_y   Stride, in elements, across y.
+     * @param[in] pad_left   Padding across x on the left, in elements.
+     * @param[in] pad_top    Padding across y on the top, in elements.
+     * @param[in] pad_right  Padding across x on the right, in elements.
+     * @param[in] pad_bottom Padding across y on the bottom, in elements.
+     * @param[in] round      Dimensions rounding.
+     */
+    PadStrideInfo(unsigned int stride_x, unsigned int stride_y,
+                  unsigned int pad_left, unsigned int pad_right,
+                  unsigned int pad_top, unsigned int pad_bottom,
+                  DimensionRoundingType round)
+        : _stride(std::make_pair(stride_x, stride_y)),
+          _pad_left(pad_left),
+          _pad_top(pad_top),
+          _pad_right(pad_right),
+          _pad_bottom(pad_bottom),
           _round_type(round)
     {
     }
@@ -428,16 +453,45 @@ public:
     }
     std::pair<unsigned int, unsigned int> pad() const
     {
-        return _pad;
+        //this accessor should be used only when padding is symmetric
+        ARM_COMPUTE_ERROR_ON(_pad_left != _pad_right || _pad_top != _pad_bottom);
+        return std::make_pair(_pad_left, _pad_top);
     }
+
+    unsigned int pad_left() const
+    {
+        return _pad_left;
+    }
+    unsigned int pad_right() const
+    {
+        return _pad_right;
+    }
+    unsigned int pad_top() const
+    {
+        return _pad_top;
+    }
+    unsigned int pad_bottom() const
+    {
+        return _pad_bottom;
+    }
+
     DimensionRoundingType round() const
     {
         return _round_type;
     }
 
+    bool has_padding() const
+    {
+        return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
+    }
+
 private:
     std::pair<unsigned int, unsigned int> _stride;
-    std::pair<unsigned int, unsigned int> _pad;
+    unsigned int _pad_left;
+    unsigned int _pad_top;
+    unsigned int _pad_right;
+    unsigned int _pad_bottom;
+
     DimensionRoundingType _round_type;
 };
 
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 9e9d0b0ccc..e3018461e3 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -97,7 +97,7 @@ __kernel void reshape_to_columns(
     }
 }
 
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
@@ -134,8 +134,8 @@ __kernel void im2col_generic(
     const int batch = get_global_id(2) / filter_depth; // the batch
 
     // Calculate input indeces
-    const int xi = xc * STRIDE_X - PAD_X;
-    const int yi = yc * STRIDE_Y - PAD_Y;
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
 
     // Calculate output indeces
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
@@ -149,9 +149,9 @@ __kernel void im2col_generic(
     {
         for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
         {
-#if PAD_X == 0 && PAD_Y == 0
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else  // PAD_X == 0 && PAD_Y == 0
+#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
             {
                 *output_ptr = 0;
@@ -160,7 +160,7 @@ __kernel void im2col_generic(
             {
                 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
             }
-#endif // PAD_X == 0 && PAD_Y == 0
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
         }
     }
 
@@ -245,7 +245,7 @@ __kernel void im2col_kernel3x3_padx0_pady0(
     }
 #endif // HAS_BIAS
 }
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
 
 #if defined(WIDTH_OUTPUT)
 /** This kernel performs a reshaping of the output of the convolution layer.
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 9c2c3a5b37..081a4e6c44 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -226,11 +226,11 @@ __kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARA
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -255,11 +255,11 @@ __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(d
     Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
 
     const int src_pixel_linear = get_global_id(1) * STRIDE_X;
-    const int full_length      = SRC_WIDTH + 2 * PAD_X;
+    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
     const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
 
-    const int src_x = -PAD_X + src_pixel_linear % max_initial_x;
-    const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y;
+    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
+    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
     const int src_z = get_global_id(2);
 
     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
@@ -281,7 +281,7 @@ __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(d
     }
 }
 
-#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
 
 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 0eaadb80c6..5c7fe7e0b4 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -58,8 +58,10 @@ void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *outpu
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
     build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-    build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
-    build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+    build_opts.emplace("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.emplace("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+    build_opts.emplace("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+    build_opts.emplace("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
     build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 98a799f783..6cc45573d8 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -63,18 +63,16 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
         build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
     }
 
-    int pad_x    = 0;
-    int pad_y    = 0;
     int stride_x = 0;
     int stride_y = 0;
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+
     std::tie(stride_x, stride_y) = conv_info.stride();
 
     const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
                                      && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                     input->info()->tensor_shape().cend(),
                                                     output->info()->tensor_shape().cbegin() + 1))
-                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
 
     if(!run_img2col_reduced)
     {
@@ -90,12 +88,14 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
         build_opts.emplace("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
         build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
         build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-        build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
-        build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+        build_opts.emplace("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+        build_opts.emplace("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+        build_opts.emplace("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+        build_opts.emplace("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
 
-        if(kernel_dims.width == 3 && kernel_dims.height == 3 && conv_info.pad().first == 0 && conv_info.pad().second == 0)
+        if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding())
         {
             _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_kernel3x3_padx0_pady0", build_opts));
         }
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 71910e3a69..1c018b269b 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -163,16 +163,17 @@ void NEIm2ColKernel::run_generic(const Window &window)
     const int input_stride_y = _input->info()->strides_in_bytes().y();
     const int input_stride_z = _input->info()->strides_in_bytes().z();
 
-    int pad_x    = 0;
-    int pad_y    = 0;
+    int pad_left = 0;
+    int pad_top  = 0;
     int stride_x = 0;
     int stride_y = 0;
-    std::tie(pad_x, pad_y)       = _conv_info.pad();
+    pad_left     = _conv_info.pad_left();
+    pad_top      = _conv_info.pad_top();
     std::tie(stride_x, stride_y) = _conv_info.stride();
 
     // Setup input window
-    const int start_x = -pad_x;
-    const int start_y = -pad_y;
+    const int start_x = -pad_left;
+    const int start_y = -pad_top;
 
     Window window_in(window);
     // The first three dimensions of the input are increased by the inner loops
@@ -291,18 +292,15 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
                                         _conv_info);
     _has_bias = has_bias;
 
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    std::tie(pad_x, pad_y)       = conv_info.pad();
     std::tie(stride_x, stride_y) = conv_info.stride();
 
     bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
                                && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                               input->info()->tensor_shape().cend(),
                                               output->info()->tensor_shape().cbegin() + 1))
-                               && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
 
     Window window = calculate_max_window(*input->info(), Steps());
 
@@ -334,18 +332,18 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
                 break;
 #ifdef ARM_COMPUTE_ENABLE_FP16
             case DataType::F16:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
                 break;
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
             case DataType::QS8:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
                 break;
             case DataType::QS16:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index d5ce1ea027..0a35e07430 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -288,37 +288,39 @@ const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsig
                                                                            unsigned int kernel_width, unsigned int kernel_height,
                                                                            const PadStrideInfo &pad_stride_info)
 {
-    const unsigned int pad_x    = pad_stride_info.pad().first;
-    const unsigned int pad_y    = pad_stride_info.pad().second;
-    const unsigned int stride_x = pad_stride_info.stride().first;
-    const unsigned int stride_y = pad_stride_info.stride().second;
-    unsigned int       w        = 0;
-    unsigned int       h        = 0;
+    const unsigned int pad_left   = pad_stride_info.pad_left();
+    const unsigned int pad_top    = pad_stride_info.pad_top();
+    const unsigned int pad_right  = pad_stride_info.pad_right();
+    const unsigned int pad_bottom = pad_stride_info.pad_bottom();
+    const unsigned int stride_x   = pad_stride_info.stride().first;
+    const unsigned int stride_y   = pad_stride_info.stride().second;
+    unsigned int       w          = 0;
+    unsigned int       h          = 0;
     switch(pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
     }
 
     // Make sure that border operations will start from inside the input and not the padded area
-    if(((w - 1) * stride_x) >= (width + pad_x))
+    if(((w - 1) * stride_x) >= (width + pad_left))
     {
         --w;
     }
-    if(((h - 1) * stride_y) >= (height + pad_y))
+    if(((h - 1) * stride_y) >= (height + pad_top))
     {
         --h;
     }
-    ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_x));
-    ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_y));
+    ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_left));
+    ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_top));
 
     return std::make_pair(w, h);
 }
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 4b1bfd8b8f..a3be6f4144 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -128,10 +128,7 @@ void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weig
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index f34f497436..155f4e561a 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -136,10 +136,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
diff --git a/tests/datasets/DepthwiseConvolutionDataset.h b/tests/datasets/DepthwiseConvolutionDataset.h
index 593b8238d7..8cceae0083 100644
--- a/tests/datasets/DepthwiseConvolutionDataset.h
+++ b/tests/datasets/DepthwiseConvolutionDataset.h
@@ -125,6 +125,13 @@ public:
         add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U), PadStrideInfo(1, 2, 1, 1));
         add_config(TensorShape(23U, 27U, 5U), TensorShape(11U, 3U, 5U), TensorShape(13U, 13U, 5U), PadStrideInfo(1, 2, 0, 0));
         add_config(TensorShape(17U, 31U, 2U, 3U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U, 3U), PadStrideInfo(1, 2, 1, 1));
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index 8eda2e87fe..aa9d9f8899 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -58,6 +58,14 @@ public:
         add_config(TensorShape(17U, 31U, 2U, 4U), TensorShape(5U, 3U, 2U, 19U), TensorShape(19U), TensorShape(15U, 16U, 19U, 4U), PadStrideInfo(1, 2, 1, 1));
         // Arbitrary batch size
         add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0));
+
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 } // namespace datasets
diff --git a/tests/validation/CPP/ConvolutionLayer.cpp b/tests/validation/CPP/ConvolutionLayer.cpp
index 656cd2ee26..ab3690a493 100644
--- a/tests/validation/CPP/ConvolutionLayer.cpp
+++ b/tests/validation/CPP/ConvolutionLayer.cpp
@@ -26,6 +26,8 @@
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
+#include "tests/framework/Asserts.h"
+
 namespace arm_compute
 {
 namespace test
@@ -149,21 +151,24 @@ SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor
     const int width_weights  = weights.shape().x();
     const int height_weights = weights.shape().y();
     const int depth_weights  = weights.shape().z();
-    const int pad_xi         = std::min(static_cast<int>(info.pad().first), width_weights / 2);
-    const int pad_yi         = std::min(static_cast<int>(info.pad().second), height_weights / 2);
-    const int start_xi       = width_weights / 2 - pad_xi;
-    const int start_yi       = height_weights / 2 - pad_yi;
-    const int end_xi         = width_in - start_xi;
-    const int end_yi         = height_in - start_yi;
-    const int stride_xi      = info.stride().first;
-    const int stride_yi      = info.stride().second;
-    const int num_batches    = src.shape().total_size() / (width_in * height_in * depth_in);
+    const int pad_left       = std::min(static_cast<int>(info.pad_left()), width_weights / 2);
+    const int pad_top        = std::min(static_cast<int>(info.pad_top()), height_weights / 2);
+    const int pad_right      = std::min(static_cast<int>(info.pad_right()), width_weights / 2);
+    const int pad_bottom     = std::min(static_cast<int>(info.pad_bottom()), height_weights / 2);
+
+    const int start_xi    = width_weights / 2 - pad_left;
+    const int start_yi    = height_weights / 2 - pad_top;
+    const int end_xi      = width_in + pad_left - width_weights / 2 + pad_right - width_weights / 2;
+    const int end_yi      = height_in + pad_top - height_weights / 2 + pad_bottom - height_weights / 2;
+    const int stride_xi   = info.stride().first;
+    const int stride_yi   = info.stride().second;
+    const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
 
     for(int r = 0; r < num_batches; ++r)
     {
-        for(int yi = start_yi; yi < end_yi; yi += stride_yi)
+        for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
         {
-            for(int xi = start_xi; xi < end_xi; xi += stride_xi)
+            for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
             {
                 for(int ofm = 0; ofm < depth_out; ++ofm)
                 {
@@ -173,6 +178,9 @@ SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor
                     const int yo         = (yi - start_yi) / stride_yi;
                     const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
 
+                    ARM_COMPUTE_ASSERT(xo < width_out);
+                    ARM_COMPUTE_ASSERT(yo < height_out);
+
                     // Compute 3D convolution
                     convolution3d(src.data() + offset_in,
                                   weights.data() + ofm * width_weights * height_weights * depth_weights,
diff --git a/tests/validation/CPP/DepthwiseConvolution.cpp b/tests/validation/CPP/DepthwiseConvolution.cpp
index ae54494c03..b57c2686f6 100644
--- a/tests/validation/CPP/DepthwiseConvolution.cpp
+++ b/tests/validation/CPP/DepthwiseConvolution.cpp
@@ -51,29 +51,35 @@ SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTe
     SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
 
     // Compute reference
-    const size_t filter_width  = weights.shape().x();
-    const size_t filter_height = weights.shape().y();
-    const size_t filter_plane  = filter_width * filter_height;
-    const size_t input_width   = src.shape().x();
-    const size_t input_height  = src.shape().y();
-    const size_t input_depth   = src.shape().z();
-    const int    num_batches   = src.shape().total_size() / (input_width * input_height * input_depth);
+    const int filter_width  = weights.shape().x();
+    const int filter_height = weights.shape().y();
+    const int filter_plane  = filter_width * filter_height;
+    const int input_width   = src.shape().x();
+    const int input_height  = src.shape().y();
+    const int input_depth   = src.shape().z();
+    const int num_batches   = src.shape().total_size() / (input_width * input_height * input_depth);
 
-    const size_t filter_half_width  = filter_width / 2;
-    const size_t filter_half_height = filter_height / 2;
-    const size_t pad_x              = std::min(filter_half_width, static_cast<size_t>(conv_info.pad().first));
-    const size_t pad_y              = std::min(filter_half_height, static_cast<size_t>(conv_info.pad().second));
-    const size_t minimum_x          = -pad_x + filter_half_width;
-    const size_t minimum_y          = -pad_y + filter_half_height;
+    const int filter_half_width  = filter_width / 2;
+    const int filter_half_height = filter_height / 2;
+
+    const int pad_left   = std::min(static_cast<int>(conv_info.pad_left()), filter_half_width);
+    const int pad_top    = std::min(static_cast<int>(conv_info.pad_top()), filter_half_height);
+    const int pad_right  = std::min(static_cast<int>(conv_info.pad_right()), filter_half_width);
+    const int pad_bottom = std::min(static_cast<int>(conv_info.pad_bottom()), filter_half_height);
+
+    const int minimum_x = -pad_left + filter_half_width;
+    const int minimum_y = -pad_top + filter_half_height;
+    const int maximum_x = input_width + pad_left - filter_half_width + pad_right - filter_half_width;
+    const int maximum_y = input_height + pad_top - filter_half_height + pad_bottom - filter_half_height;
 
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
-        for(size_t z = 0; z < input_depth; ++z)
+        for(int z = 0; z < input_depth; ++z)
         {
-            for(size_t y = minimum_y; y < input_height - minimum_y; y += conv_info.stride().second)
+            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
             {
-                for(size_t x = minimum_x; x < input_width - minimum_x; x += conv_info.stride().first)
+                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                 {
                     Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
                     size_t      filter_offset = filter_plane * z;
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 2fa3d0ea4b..de7eaf8b3d 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -599,7 +599,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PadStrideInfo &pad_s
 {
     os << pad_stride_info.stride().first << "," << pad_stride_info.stride().second;
     os << ";";
-    os << pad_stride_info.pad().first << "," << pad_stride_info.pad().second;
+    os << pad_stride_info.pad_left() << "," << pad_stride_info.pad_right() << ","
+       << pad_stride_info.pad_top() << "," << pad_stride_info.pad_bottom();
 
     return os;
 }
-- 
cgit v1.2.1