aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2019-08-28 17:55:07 +0100
committerGiorgio Arena <giorgio.arena@arm.com>2019-10-01 09:29:33 +0000
commit5c4a8e96460eb83a6caef1c69ea5cbb4893858d7 (patch)
treee9d78cf087455995434f29bae070e5f7a9dda292
parent04ea4e88af1bdf9cf34436f7302c73e15f7bd4a5 (diff)
downloadComputeLibrary-5c4a8e96460eb83a6caef1c69ea5cbb4893858d7.tar.gz
COMPMID-2592 Create a new kernel for CLPad with SYMMETRIC and REFLECT mode
Change-Id: Icaf0516f490b2ddca6d1ea03a5cf26cc7d43041f Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-on: https://review.mlplatform.org/c/1872 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--arm_compute/core/CL/kernels/CLPadLayerKernel.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLPadLayer.h31
-rw-r--r--src/core/CL/CLKernelLibrary.cpp3
-rw-r--r--src/core/CL/cl_kernels/helpers.h63
-rw-r--r--src/core/CL/cl_kernels/pad_layer.cl231
-rw-r--r--src/core/CL/kernels/CLPadLayerKernel.cpp150
-rw-r--r--src/runtime/CL/functions/CLPadLayer.cpp245
7 files changed, 378 insertions, 350 deletions
diff --git a/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
index 7e4b31cfda..ab6ad1762d 100644
--- a/arm_compute/core/CL/kernels/CLPadLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
@@ -31,7 +31,7 @@ namespace arm_compute
{
class ICLTensor;
-/** Interface for the PadLayer function. Only CONSTANT PaddingMode is currently supported*/
+/** Interface for the PadLayer function. */
class CLPadLayerKernel : public ICLKernel
{
public:
@@ -56,7 +56,6 @@ public:
* @param[in] constant_value (Optional) Constant value to be used for the padding.
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
- * Only CONSTANT mode is currently supported.
*/
void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
@@ -68,7 +67,6 @@ public:
* @param[in] constant_value (Optional) Constant value to be used for the padding.
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
- * Only CONSTANT mode is currently supported.
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
@@ -80,6 +78,7 @@ private:
ICLTensor *_output;
int _input_start_x;
int _input_start_y;
+ bool _4d_enabled;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLPADLAYERKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 58e0cabe63..fd801eae46 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -27,18 +27,16 @@
#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
-#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to pad a tensor. This function calls the following OpenCL kernels:
+/** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
*
- * -# @ref CLMemsetKernel
- * -# @ref CLFillBorderKernel
- * -# @ref CLCopyKernel
+ * -# @ref CLPadLayerKernel if there is padding to be added
+ * -# @ref CLCopyKernel otherwise
*/
class CLPadLayer : public IFunction
{
@@ -62,8 +60,7 @@ public:
* specifies the front and the end padding in the i-th dimension.
* @param[in] constant_value (Optional) Constant value to be used for the padding.
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
- * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). Only CONSTANT
- * is currently supported.
+ * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
@@ -75,8 +72,7 @@ public:
* specifies the front and the end padding in the i-th dimension.
* @param[in] constant_value (Optional) Constant value to be used for the padding
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
- * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). Only CONSTANT
- * is currently supported.
+ * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
@@ -84,18 +80,11 @@ public:
void run() override;
private:
- void configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value);
- void configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output);
+ void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
- CLPadLayerKernel _pad_kernel;
- CLCopyKernel _copy_kernel;
- PaddingMode _mode;
- PaddingList _padding;
- size_t _num_dimensions;
- std::vector<CLStridedSlice> _slice_functions;
- std::vector<CLConcatenateLayer> _concat_functions;
- std::vector<CLTensor> _slice_results;
- std::vector<CLTensor> _concat_results;
+ CLPadLayerKernel _pad_kernel;
+ CLCopyKernel _copy_kernel;
+ bool _perform_pad;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_PADLAYER_H__ */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a5e75df8be..978e35fef6 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -413,7 +413,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "NV21_to_RGBA8888_bt709", "color_convert.cl" },
{ "NV21_to_YUV444_bt709", "color_convert.cl" },
{ "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
- { "pad_layer", "pad_layer.cl" },
+ { "pad_layer_constant", "pad_layer.cl" },
+ { "pad_layer_symmetric_reflect", "pad_layer.cl" },
{ "permute", "permute.cl" },
{ "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6f51b87bc6..f7f208529a 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -49,6 +49,69 @@
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) /* consecutive per-lane offsets 0..s-1 of vector type dt */
+
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
index fac97d25d9..ae2af468a8 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/pad_layer.cl
@@ -23,44 +23,32 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(CONST_VAL)
+#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define CONVERT_SELECT(x) CONVERT(x, VEC_DATA_TYPE(SELECT_DT, VEC_SIZE))
-
-#if VEC_SIZE == 1
-#define OFFSETS (int)0
-#elif VEC_SIZE == 2
-#define OFFSETS (int2)(0, 1)
-#elif VEC_SIZE == 4
-#define OFFSETS (int4)(0, 1, 2, 3)
-#elif VEC_SIZE == 8
-#define OFFSETS (int8)(0, 1, 2, 3, 4, 5, 6, 7)
-#elif VEC_SIZE == 16
-#define OFFSETS (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
-#else // VEC_SIZE
-#error "Only 1, 2, 3, 4, 8 and 16 vector sizes allowed"
-#endif // VEC_SIZE
-
-/** Perform a pad operation
+#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
+#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+
+#if defined(CONST_VAL)
+/** Perform a pad operation when PaddingMode is CONSTANT
*
* @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
- * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
- * @note Pad to add to the left must be passed using the -DPAD_LEFT compile flag, e.g. -DPAD_LEFT=5
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
* @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
* @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
- * @note In case pad left is more than the vector size, the number of threads to skil alond the X axis must be passed using the
- * -DTHREADS_TO_SKIP_X compile flag, e.g. -DTHREADS_TO_SKIP_X=1. This is defined as (PAD_LEFT / VEC_SIZE)
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
+ * -DNUM_THREADS_TO_SKIP_X compile flag, e.g. -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
* @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_TOP: Pad to add to the top of the input tensor (e.g. -DPAD_TOP=3)
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
* -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
* @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_NEAR: Pad to add before the first plane of the input tensor (e.g. -DPAD_NEAR=3)
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
* -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
* @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_BTOP: Pad to add before the first batch of the input tensor (e.g. -DPAD_BTOP=3)
+ * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
* -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
*
* @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32
@@ -79,68 +67,187 @@
* @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] batch (Optional) Batch index if 4D pad must be applied
*/
-__kernel void pad_layer(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint batch)
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+ ,
+ uint batch
+#endif // defined(PAD_W_BEFORE)
+ )
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int z = get_global_id(2);
-#if defined(PAD_BTOP) || defined(PAD_NEAR)
uint cond = 0;
-#if defined(PAD_BTOP)
- cond |= batch < PAD_BTOP || batch >= (PAD_BTOP + SRC_BATCH);
-#endif // defined(PAD_BTOP)
-#if defined(PAD_NEAR)
- cond |= z < PAD_NEAR || z >= (PAD_NEAR + SRC_DEPTH);
-#endif // defined(PAD_NEAR)
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+
if(cond)
{
Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- VSTORE(VEC_SIZE)((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
}
else
{
-#endif // defined(PAD_BTOP) || defined(PAD_NEAR)
-
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#if defined(THREADS_TO_SKIP_X)
+#if defined(NUM_THREADS_TO_SKIP_X)
/* In case the pad left is greater than the vector size, and we are past the threads operating solely on pad values,
* the input pointer must be brought back along the X axis to start from the first non-pad values.
*
- * E.g. with VEC_SIZE=2, PAD_LEFT=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
- * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) < PAD_LEFT
- * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, 3)) < PAD_LEFT
+ * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
+ * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) < PAD_X_BEFORE
+ * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, 3)) < PAD_X_BEFORE
* -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * VEC_SIZE) == 4) values, reading |4 5|
- * -# To detect this, we use ((PAD_LEFT / VEC_SIZE) == THREADS_TO_SKIP_X == 2) and check that it is >= to the current x
- * -# So, we bring the pointer back of THREADS_TO_SKIP_X threads, which means multiplying this constant by the input's step along the X axis
- * -# Now that the pointer is back of ((THREADS_TO_SKIP_X * src_step_x) == 4) values, it will read the desired values |0 1|
+ * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and check that it is >= to the current x
+ * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying this constant by the input's step along the X axis
+ * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it will read the desired values |0 1|
*/
- src.ptr -= select(0u, THREADS_TO_SKIP_X * src_step_x, x >= THREADS_TO_SKIP_X);
-#endif // defined(THREADS_TO_SKIP_X)
-#if defined(PAD_NEAR)
- src.ptr -= PAD_NEAR * src_step_z;
-#endif // defined(PAD_NEAR)
-#if defined(PAD_BTOP)
- src.ptr -= PAD_BTOP * SRC_DEPTH * src_step_z;
-#endif // defined(PAD_BTOP)
+ src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
+#endif // defined(NUM_THREADS_TO_SKIP_X)
+#if defined(PAD_Z_BEFORE)
+ src.ptr -= PAD_Z_BEFORE * src_step_z;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+#endif // defined(PAD_W_BEFORE)
VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
- VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + OFFSETS;
- VEC_INT cond = xs_out < (VEC_INT)PAD_LEFT || xs_out >= (VEC_INT)(PAD_LEFT + SRC_WIDTH);
-#if defined(PAD_TOP)
- cond |= (VEC_INT)y < (VEC_INT)PAD_TOP || (VEC_INT)y >= (VEC_INT)(PAD_TOP + SRC_HEIGHT);
-#endif // defined(PAD_TOP)
+ VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
+ VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+#if defined(PAD_Y_BEFORE)
+ cond |= (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
VSTORE(VEC_SIZE)
- (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT_SELECT(cond)), 0, (__global DATA_TYPE *)dst.ptr);
-#if defined(PAD_NEAR) || defined(PAD_BTOP)
+ (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, (__global DATA_TYPE *)dst.ptr);
}
-#endif // defined(PAD_NEAR) || defined(PAD_BTOP)
}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(CONST_VAL)
+#endif // defined(CONST_VAL)
+
+#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+
+#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
+ * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5
+ * @note When after pad X, starting point to read backward from must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Get current thread position
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Define conditions based on the thread X position w.r.t. pad left and right
+ const int x_out_first = x * VEC_SIZE;
+ const int x_out_last = x_out_first + VEC_SIZE;
+ const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
+ const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+ const int is_inside_input = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+ // Calculate base pointers
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Calculate input tensor's offset based on the defined conditions
+ int x_offset = 0;
+ x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+ x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+ x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+ x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+ int neg_offs = x_offset < 0;
+ x_offset = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+ // Load input values from the computed offset
+ int y_in = y;
+ int z_in = z;
+#if defined(PAD_Y_BEFORE)
+ y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+ y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+ z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Z_BEFORE)
+
+ src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+ // Choose rearrangement policy based on the defined conditions
+ src_vals = select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
+ src_vals = select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
+ src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+ src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+ // Store
+ VSTORE(VEC_SIZE)
+ (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 9dfd380f7c..3d951a930c 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -32,27 +32,51 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
- ARM_COMPUTE_UNUSED(input, output, constant_value);
- ARM_COMPUTE_RETURN_ERROR_ON(padding.empty());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only CONSTANT mode supported.");
+ ARM_COMPUTE_UNUSED(constant_value);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+ if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+ for(size_t i = 0; i < padding.size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
+ }
+ }
+
+ if(output->total_size() > 0)
+ {
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ }
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode,
+ unsigned int &num_elems_processed_per_iteration)
{
ARM_COMPUTE_UNUSED(constant_value, mode);
- // Output auto initialization if not yet initialized
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
- const unsigned int num_elems_processed_per_iteration = std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+ num_elems_processed_per_iteration = std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
+ if(input->dimension(0) < num_elems_processed_per_iteration)
+ {
+ num_elems_processed_per_iteration = 1 << static_cast<unsigned int>(std::log2(input->dimension(0)));
+ }
// Configure kernel window
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- const int input_start_x = -(padding.at(0).first % num_elems_processed_per_iteration);
- const int input_start_y = padding.size() > 1 ? -padding.at(1).first : 0;
+ const int input_start_x = mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0;
+ const int input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
AccessWindowRectangle input_access(input, input_start_x, input_start_y, num_elems_processed_per_iteration, 1);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
@@ -66,7 +90,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0)
+ : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false)
{
}
@@ -76,54 +100,101 @@ void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, cons
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
- _input = input;
- _output = output;
+ _input = input;
+ _output = output;
+ _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
+
+ // Configure window
+ unsigned int vec_size;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), padding, constant_value, mode, vec_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
// Set build options
- const unsigned int num_elems_processed_per_iteration = std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type())));
- _input_start_x = -(padding.at(0).first % num_elems_processed_per_iteration);
- _input_start_y = padding.size() > 1 ? -padding.at(1).first : 0;
+ std::string kernel_name = "pad_layer_";
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, input->info()->data_type()));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(padding.at(0).first));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(padding.at(0).first > num_elems_processed_per_iteration, "-DTHREADS_TO_SKIP_X=" + support::cpp11::to_string(padding.at(0).first / num_elems_processed_per_iteration));
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0;
+ _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
if(padding.size() > 1)
{
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(padding.at(1).first));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
if(padding.size() > 2)
{
- build_opts.add_option("-DPAD_NEAR=" + support::cpp11::to_string(padding.at(2).first));
- build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
+ }
+ }
+
+ switch(mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ kernel_name += "constant";
- if(padding.size() > 3)
+ build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(pad_x_before >= vec_size, "-DNUM_THREADS_TO_SKIP_X=" + support::cpp11::to_string(pad_x_before / vec_size));
+
+ if(_4d_enabled)
{
- build_opts.add_option("-DPAD_BTOP=" + support::cpp11::to_string(padding.at(3).first));
+ build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
}
+
+ break;
}
+ case PaddingMode::SYMMETRIC:
+ case PaddingMode::REFLECT:
+ {
+ kernel_name += "symmetric_reflect";
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
+ const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+
+ build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pad_layer", build_opts.options()));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), padding, constant_value, mode);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
+ unsigned int vec_size;
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), padding, constant_value, mode).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), padding, constant_value, mode, vec_size).first);
return Status{};
}
@@ -145,7 +216,10 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
- add_argument<unsigned int>(idx, batch++);
+ if(_4d_enabled)
+ {
+ add_argument<unsigned int>(idx, batch++);
+ }
enqueue(queue, *this, slice_out, lws_hint());
}
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 88b1b77a0d..8f36a69866 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,155 +23,25 @@
*/
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
namespace arm_compute
{
CLPadLayer::CLPadLayer()
- : _pad_kernel(), _copy_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+ : _pad_kernel(), _copy_kernel(), _perform_pad(false)
{
}
-void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
-{
- int64_t last_padding_dimension = _padding.size() - 1;
- // Reflecting can be performed by effectively unfolding the input as follows:
- // For each dimension starting at DimX:
- // Create a before and after slice, which values depend on the selected padding mode
- // Concatenate the before and after padding with the tensor to be padded
-
- // Two strided slice functions will be required for each dimension padded as well as a
- // concatenate function and the tensors to hold the temporary results.
- _slice_functions.resize(2 * _num_dimensions);
- _slice_results.resize(2 * _num_dimensions);
- _concat_functions.resize(_num_dimensions);
- _concat_results.resize(_num_dimensions - 1);
-
- Coordinates starts_before{};
- Coordinates ends_before{};
- Coordinates starts_after{};
- Coordinates ends_after{};
- Coordinates strides{};
- ICLTensor *prev = input;
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
- if(i > 0)
- {
- strides.set(i - 1, 1);
- }
-
- if(_padding[i].first > 0 || _padding[i].second > 0)
- {
- // Set the starts, ends, and strides values for the current dimension.
- // Due to the bit masks passed to strided slice, the values below the current dimension in
- // starts and ends will be ignored so do not need to be modified.
- if(_mode == PaddingMode::REFLECT)
- {
- starts_before.set(i, _padding[i].first);
- ends_before.set(i, 0);
- starts_after.set(i, input->info()->dimension(i) - 2);
- ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
- strides.set(i, -1);
- }
- else
- {
- starts_before.set(i, _padding[i].first - 1);
- ends_before.set(i, -1);
- starts_after.set(i, input->info()->dimension(i) - 1);
- ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
- strides.set(i, -1);
- }
-
- // Strided slice wraps negative indexes around to the end of the range,
- // instead this should indicate use of the full range and so the bit mask will be modified.
- const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
- const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
- const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
- const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
-
- // Reflect the input values for the padding before and after the input.
- std::vector<ICLTensor *> concat_vector;
- if(_padding[i].first > 0)
- {
- if(i < prev->info()->num_dimensions())
- {
- _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
- concat_vector.push_back(&_slice_results[2 * i]);
- }
- else
- {
- // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
- concat_vector.push_back(prev);
- }
- }
- concat_vector.push_back(prev);
- if(_padding[i].second > 0)
- {
- if(i < prev->info()->num_dimensions())
- {
- _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
- concat_vector.push_back(&_slice_results[2 * i + 1]);
- }
- else
- {
- // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
- concat_vector.push_back(prev);
- }
- }
- // Concatenate the padding before and after with the input.
- ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
- _concat_functions[i].configure(concat_vector, out, i);
- prev = out;
- }
- }
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- if((static_cast<int32_t>(i) != last_padding_dimension))
- {
- _concat_results[i].allocator()->allocate();
- }
- _slice_results[2 * i].allocator()->allocate();
- _slice_results[2 * i + 1].allocator()->allocate();
- }
-}
void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
- _padding = padding;
- _mode = mode;
-
- TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
- // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
- int64_t last_padding_dimension = _padding.size() - 1;
- for(; last_padding_dimension >= 0; --last_padding_dimension)
+ _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
{
- if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
- {
- break;
- }
- }
- _num_dimensions = last_padding_dimension + 1;
- if(_num_dimensions > 0)
+ return info.first > 0 || info.second > 0;
+ });
+
+ if(_perform_pad)
{
- switch(_mode)
- {
- case PaddingMode::CONSTANT:
- {
- _pad_kernel.configure(input, output, padding, constant_value, mode);
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- configure_reflect_symmetric_mode(input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Padding mode not supported.");
- }
+ _pad_kernel.configure(input, output, padding, constant_value, mode);
}
else
{
@@ -179,109 +49,34 @@ void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingLis
_copy_kernel.configure(input, output);
}
}
-
Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
- TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
-
- // Use CLCopyKernel and CLMemsetKernel to validate all padding modes as this includes all of the shape and info validation.
- PaddingList padding_extended = padding;
- for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
{
- padding_extended.emplace_back(PaddingInfo{ 0, 0 });
- }
+ return info.first > 0 || info.second > 0;
+ });
- Window copy_window = Window();
- for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+ if(perform_pad)
{
- copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
- }
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
- }
-
- switch(mode)
- {
- case PaddingMode::CONSTANT:
- {
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- for(uint32_t i = 0; i < padding.size(); ++i)
- {
- if(mode == PaddingMode::REFLECT)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
- }
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid mode");
- }
+ Window copy_window = Window();
+ copy_window.use_tensor_dimensions(output->tensor_shape());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
}
return Status{};
}
-
void CLPadLayer::run()
{
- if(_num_dimensions > 0)
+ if(_perform_pad)
{
- switch(_mode)
- {
- case PaddingMode::CONSTANT:
- {
- CLScheduler::get().enqueue(_pad_kernel, false);
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- if(_padding[i].first > 0 || _padding[i].second > 0)
- {
- if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
- {
- _slice_functions[2 * i].run();
- }
- if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
- {
- _slice_functions[2 * i + 1].run();
- }
- CLScheduler::get().sync();
- _concat_functions[i].run();
- CLScheduler::get().sync();
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Padding mode not supported.");
- }
+ CLScheduler::get().enqueue(_pad_kernel);
}
else
{
- CLScheduler::get().enqueue(_copy_kernel, true);
+ CLScheduler::get().enqueue(_copy_kernel);
}
}
-} // namespace arm_compute
+} // namespace arm_compute