APPBROWSER-391: Fix GLES COMPUTE alignment issues

APPBROWSER-402: Performance optimization for squeezenet/xray model

Change-Id: If31b186b99a6d6087164019fe94d3ac9279e3204
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119526
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
author: Frank Lei <frank.lei@arm.com> 2018-02-01 14:47:14 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:47:40 +0000
commit: 4406fd6cc4abded564d3791324e1f48bdfd34273 (patch)
tree: 22fe402fe9ac7ca338df49e9eccd6eb1587ae875
parent: 898d399a0f62c15612a52df4bff5018e783214e4 (diff)
download: ComputeLibrary-4406fd6cc4abded564d3791324e1f48bdfd34273.tar.gz
24 files changed, 280 insertions, 120 deletions
diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
index ab4e57e0ce..7329864b85 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCTensor.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,6 +75,18 @@ public:
      */
     virtual GLuint gc_buffer() const = 0;
 
+    /** Flag indicating whether the tensor has been left aligned by a kernel and therefore needs shifting.
+     *
+     * @return True if the tensor is left aligned.
+     */
+    bool needs_shifting() const;
+    /** Set the flag indicating whether or not a tensor needs shifting.
+     *
+     * @param[in] needs_shifting Indicates if the tensor is left aligned or not.
+     *
+     */
+    void set_needs_shifting(bool needs_shifting);
+
 protected:
     /** Method to be implemented by the child class to map the SSBO.
      *
@@ -92,6 +104,7 @@ protected:
 
 private:
     uint8_t *_mapping;
+    bool     _needs_shifting;
 };
 
 using IGCImage = IGCTensor;
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
index ce220cc564..06a54dd0b3 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,6 +71,7 @@ private:
     IGCTensor       *_output;
     int              _top_bottom;
     int              _left_right;
+    int              _depth_offset;
 };
 }
 #endif /* __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
index 3f936f6cb9..bdbc50c735 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #ifndef __ARM_COMPUTE_GCSCALEKERNEL_H__
 #define __ARM_COMPUTE_GCSCALEKERNEL_H__
 
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
@@ -32,7 +32,7 @@ namespace arm_compute
 class IGCTensor;
 
 /** Interface for the scale kernel */
-class GCScaleKernel : public IGCSimple2DKernel
+class GCScaleKernel : public IGCSimple3DKernel
 {
 public:
     /** Initialise the kernel's inputs, output and interpolation policy
@@ -47,6 +47,7 @@ public:
     void configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
 
     // Inherited methods overridden:
+    void run(const Window &window) override;
     BorderSize border_size() const override;
 };
 } // namespace arm_compute
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
index 5f108764b4..452caae146 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
@@ -78,6 +78,7 @@ public:
 private:
     IGCTensor    *_input;
     gles::NDRange _lws;
+    int           _left_padding;
 };
 }
 #endif /*__ARM_COMPUTE_GCTENSORSHIFTKERNEL_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
index 7b99ea5645..e523356fca 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,10 @@
 #define __ARM_COMPUTE_GCDEPTHWISECONVOLUTION_H__
 
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
@@ -38,9 +40,11 @@ class IGCTensor;
  * -# @ref GCFillBorderKernel (if pad_x or pad_y > 0)
  *
  */
-class GCDepthwiseConvolutionLayer3x3 : public IGCSimpleFunction
+class GCDepthwiseConvolutionLayer3x3 : public IFunction
 {
 public:
+    /** Default constructor */
+    GCDepthwiseConvolutionLayer3x3();
     /** Initialize the function's source, destination, conv and border_size.
      *
      * @param[in, out] input     Source tensor. Data type supported: F16. (Written to only for border filling).
@@ -51,6 +55,14 @@ public:
      * @param[in]      conv_info Padding and stride information to use for the convolution.
      */
     void configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override final;
+
+private:
+    std::unique_ptr<IGCKernel> _kernel;
+    GCFillBorderKernel         _border_handler;
+    GCTensorShiftKernel        _shift_handler;
 };
 }
 #endif /*__ARM_COMPUTE_GCDEPTHWISECONVOLUTION_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
index c6b948be1f..976aee7b83 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
@@ -53,16 +53,17 @@ public:
     GCDirectConvolutionLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs.
-     *                       Data types supported: F16/F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
-     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                       Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in,out] input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: F16/F32.
+     *                          input will be written to only if it is currently left aligned.
+     * @param[in]     weights   Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]     biases    Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out]    output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]     conv_info Contains padding and stride information described in @ref PadStrideInfo.
      */
-    void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info);
+    void configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info);
 
     // Inherited methods overridden:
     void run() override final;
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
index 5733542a72..6f1dbb05bc 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,11 @@
 #ifndef __ARM_COMPUTE_GCPOOLINGLAYER_H__
 #define __ARM_COMPUTE_GCPOOLINGLAYER_H__
 
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
@@ -38,9 +42,10 @@ class IGCTensor;
  * -# @ref GCFillBorderKernel (executed if padding size is different from zero)
  * -# @ref GCPoolingLayerKernel
  */
-class GCPoolingLayer : public IGCSimpleFunction
+class GCPoolingLayer : public IFunction
 {
 public:
+    GCPoolingLayer();
     /** Set the input and output tensors.
      *
      * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: F16/F32.
@@ -57,6 +62,13 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    void run() override final;
+
+private:
+    std::unique_ptr<IGCKernel> _kernel;
+    GCFillBorderKernel         _border_handler;
+    GCTensorShiftKernel        _shift_handler;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_GCPOOLINGLAYER_H__ */
diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
index 5576665243..19af777446 100644
--- a/src/core/GLES_COMPUTE/IGCTensor.cpp
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,7 @@
 using namespace arm_compute;
 
 IGCTensor::IGCTensor()
-    : _mapping(nullptr)
+    : _mapping(nullptr), _needs_shifting(false)
 {
 }
 
@@ -52,3 +52,13 @@ uint8_t *IGCTensor::buffer() const
 {
     return _mapping;
 }
+
+bool IGCTensor::needs_shifting() const
+{
+    return _needs_shifting;
+}
+
+void IGCTensor::set_needs_shifting(bool needs_shifting)
+{
+    _needs_shifting = needs_shifting;
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
index 0ff43605ba..2ab6d5eac5 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,20 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z =
 precision mediump float;
 #define ADD(x, y) (x) + (y)
 
-/** This function add two images.
+/** This function adds two tensors.
  *
- * @param[in]  src1_ptr   Pointer to the first source image. Supported data types: F16
- * @param[in]  src1_attrs The attributes of the first source image
- * @param[in]  src2_ptr   Pointer to the second source image. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_attrs The attributes of the second source image
- * @param[out] dst_ptr    Pointer to the destination image. Supported data types: Same as @p src1_ptr
- * @param[in]  dst_attrs  The attributes of the destination image
+ * @param[in]  src1_ptr   Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src1_attrs The attributes of the first source tensor
+ * @param[in]  src2_ptr   Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_attrs The attributes of the second source tensor
+ * @param[out] dst_ptr    Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_attrs  The attributes of the destination tensor
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src1_attrs;
-    ImageAttributes src2_attrs;
-    ImageAttributes dst_attrs;
+    Tensor3DAttributes src1_attrs;
+    Tensor3DAttributes src2_attrs;
+    Tensor3DAttributes dst_attrs;
 };
 
 TENSOR_DECLARATION(1, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
@@ -51,9 +51,9 @@ TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
 
 void main(void)
 {
-    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
-    ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
-    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+    Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+    Tensor3DIterator dst_iter  = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4 tmp1[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
     vec4 tmp2[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src2_ptr, src2_iter);
@@ -62,4 +62,4 @@ void main(void)
     addition[1] = ADD(tmp1[1], tmp2[1]);
 
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, addition);
-}
-\ No newline at end of file
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 0c8b5bf0bb..69ac50b4d0 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,8 +53,8 @@ void main(void)
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -66,7 +66,7 @@ void main(void)
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 #endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
index b2689a257d..b72c3392aa 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,23 +29,23 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z =
 // We DO have to use highp for DATA_TYPE_FP16 float here to calculate the coordinates of source tensor. float is highp by default, but we still write it down here to make it more clearly, and mediump is only used for src/dst tensor in shader body.
 precision highp float;
 
-/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel FP16.
+/** Performs an affine transformation on a tensor interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel FP16.
  *
  * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: FP16.
  * @param[in]  src_attrs    The attributes of the source tensor
  * @param[out] dst_ptr      Pointer to the destination tensor. Supported data types: FP16. (Must be the same as the input)
  * @param[in]  dst_attrs    The attributes of the destination tensor
- * @param[in]  input_width  Input image width
- * @param[in]  input_height Input image height
+ * @param[in]  input_width  Input tensor width
+ * @param[in]  input_height Input tensor height
  * @param[in]  scale        The scale factor along x/y dimension
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src_attrs;
-    ImageAttributes dst_attrs;
-    float           input_width;
-    float           input_height;
-    vec2            scale;
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    float              input_width;
+    float              input_height;
+    vec2               scale;
 };
 
 #if defined(DATA_TYPE_FP16)
@@ -75,8 +75,8 @@ vec4[2] clamp_to_border_with_size(vec4[2] coords, float width, float height, flo
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4[2] tc = clamp_to_border_with_size(transform_nearest(vec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y), scale), input_width, input_height, float(BORDER_SIZE));
 
@@ -85,7 +85,7 @@ void main()
 
     for(int i = 0; i < 4; i++)
     {
-        uint offset_in_bytes = image_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]));
+        uint offset_in_bytes = tensor3D_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]), int(gl_GlobalInvocationID.z));
 
         s = LOAD_UNPACK2_HALF(src_ptr, uint(offset_in_bytes >> src_shift));
 
@@ -107,15 +107,15 @@ TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     uvec2 tc = uvec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y >> uint(1));
 
     mediump vec4 s = vec4(0.0f);
     mediump      vec4[2] d;
 
-    s = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, int(tc[0]), int(tc[1])));
+    s = LOAD_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, int(tc[0]), int(tc[1]), int(gl_GlobalInvocationID.z)));
 
     d[0] = vec4(s.x, s.x, s.y, s.y);
     d[1] = vec4(s.z, s.z, s.w, s.w);
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index b8672c662d..d7c645d09d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,16 +109,26 @@ void GCActivationLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
+    if(_input == _output)
+    {
+        slice_in.shift(Window::DimX, -(_input->info()->padding()).left);
+    }
 
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1;
-        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _input, binding++, slice_in);
         add_3D_tensor_argument(idx, _output, binding++, slice);
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
index caec324de2..06cf40990c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,18 +135,24 @@ void GCArithmeticAdditionKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_2D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1; // SSBO binding starts from 1.
-        add_2D_tensor_argument(idx, _input1, binding++, slice);
-        add_2D_tensor_argument(idx, _input2, binding++, slice);
-        add_2D_tensor_argument(idx, _output, binding++, slice);
+        add_3D_tensor_argument(idx, _input1, binding++, slice_in);
+        add_3D_tensor_argument(idx, _input2, binding++, slice_in);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index a41b62fbab..cd93f6997e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -119,7 +119,10 @@ void GCBatchNormalizationLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
 
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -130,14 +133,16 @@ void GCBatchNormalizationLayerKernel::run(const Window &window)
     add_1D_tensor_argument(idx, _beta, 5, vector_slice);
     add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
 
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 7b1848c32b..36d1b29bba 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -38,7 +38,7 @@
 using namespace arm_compute;
 
 GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
 {
 }
 
@@ -61,8 +61,9 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
 
-    _input  = input;
-    _output = output;
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
 
     // Add build options
     std::set<std::string> build_opts;
@@ -76,11 +77,8 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i
     _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
     _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
 
-    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
-
-    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
-    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
-    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+    build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom));
 
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
@@ -118,17 +116,24 @@ void GCDepthConcatenateLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice     = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+    slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
-        add_3D_tensor_argument(idx, _output, 2, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice_out);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 28b5bd2d62..9343268d9e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,16 +173,20 @@ void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, con
     const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
 
     // Calculate input right and bottom border
-    const int input_width    = input->info()->dimension(0);
-    const int input_height   = input->info()->dimension(1);
-    const int padding_right  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + 2), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_left - input_width;
-    const int padding_bottom = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + 2), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_top - input_height;
+    const int input_width  = input->info()->dimension(0);
+    const int input_height = input->info()->dimension(1);
+
+    const int input_total_width  = std::max(int(input->info()->padding().left), int(_conv_pad_left)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_left));
+    const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_top)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_top));
+
+    const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_left;
+    const int input_padding_bottom = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_top;
 
     BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
 
     Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
 
-    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + padding_right, input_height + padding_bottom);
+    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + input_padding_right, input_height + input_padding_bottom);
     AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
     AccessWindowStatic bias_access    = AccessWindowStatic(nullptr, 0, 0, 0, 1);
 
@@ -224,6 +228,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Create input window and adjust
     Window win_in = window;
     win_in.adjust(Window::DimX, -_conv_pad_left, true);
@@ -246,6 +252,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
         add_1D_tensor_argument(idx, _biases, 4, slice_biases);
     }
 
+    slice_out.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx = 0;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 1b94626356..bef30d5042 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -394,6 +394,8 @@ void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Get initial windows
     Window slice  = window.first_slice_window_3D();
     Window win_in = window;
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index bc9c7eb55a..fac29024e3 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,6 +89,8 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window slice = window.first_slice_window_3D();
 
     Window slice_in;
@@ -100,15 +102,19 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
     add_1D_tensor_argument(idx, _mean, 3, slice_in);
     add_1D_tensor_argument(idx, _sd, 4, slice_in);
 
+    slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index c688cd4567..3a0944cd48 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -198,11 +198,14 @@ std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITenso
         const int output_height         = output->dimension(1);
         const int output_padding_right  = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width;
         const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
-        const int input_padding_right   = ceil_to_multiple(input_width + 2 * border_size.right, num_elems_processed_per_iteration) - (input_width + 2 * border_size.right);
-        const int input_padding_bottom  = ceil_to_multiple(input_height + 2 * border_size.bottom, 1) - (input_height + 2 * border_size.bottom);
+
+        const int input_total_width    = std::max(int(input->padding().left), int(pool_pad_x)) + input_width + std::max(int(input->padding().right), int(pool_pad_x));
+        const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_processed_per_iteration) - input_width - pool_pad_x;
+        const int input_total_height   = std::max(int(input->padding().top), int(pool_pad_y)) + input_height + std::max(int(input->padding().bottom), int(pool_pad_y));
+        const int input_padding_bottom = input_total_height - input_height - pool_pad_y;
 
         // Configure kernel window
-        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right + input_padding_right, input_height + border_size.bottom + input_padding_bottom);
+        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + input_padding_right, input_height + input_padding_bottom);
         AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
         bool               window_changed = update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -340,13 +343,19 @@ void GCPoolingLayerKernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
+
+    Window slice         = window_collapsed.first_slice_window_3D();
+    Window slice_in_orig = window_collapsed.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
 
     do
     {
         // Upsample input by pool size
-        Window in_slice(slice); // NOLINT
+        Window in_slice(slice_in_orig); // NOLINT
         in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
         in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
 
@@ -358,5 +367,5 @@ void GCPoolingLayerKernel::run(const Window &window)
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_in_orig));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index f307cfb239..46d7ff9172 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,9 +128,34 @@ void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, Interpo
 
     IGCKernel::configure(win);
 
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the tensor parameters
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(0)));
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(1)));
     _kernel.set_argument<float>(idx++, wr);
     _kernel.set_argument<float>(idx++, hr);
 }
+
+void GCScaleKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index c2182171a6..21946b7f8d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -39,7 +39,7 @@ using namespace arm_compute;
 using namespace arm_compute::gles_compute;
 
 GCTensorShiftKernel::GCTensorShiftKernel()
-    : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
+    : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
 {
 }
 
@@ -59,18 +59,18 @@ void GCTensorShiftKernel::configure(IGCTensor *input)
     options.emplace(("#define " + dt_name));
 
     unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right;
-    unsigned int num_elems_written_per_iteration_y = 1;
-    unsigned int num_elems_written_per_iteration_z = 1;
 
     std::stringstream kernel_name;
     kernel_name << "tensorshift";
 
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
 
-    Window                 win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_written_per_iteration_x);
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_x));
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimY);
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimZ);
 
-    update_window_and_padding(win, input_access);
+    _left_padding = _input->info()->padding().left;
 
     IGCKernel::configure(win);
 }
@@ -80,6 +80,11 @@ void GCTensorShiftKernel::run(const Window &window)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
+    if(int(_left_padding) == 0 || !_input->needs_shifting())
+    {
+        return;
+    }
+
     _kernel.use();
 
     // Get initial windows
@@ -92,14 +97,7 @@ void GCTensorShiftKernel::run(const Window &window)
 
         add_3D_tensor_argument(idx, _input, 1, slice);
 
-        const PaddingSize &padding1 = _input->info()->padding();
-
-        if(int(padding1.left) == 0)
-        {
-            break;
-        }
-
-        _kernel.set_argument(idx++, static_cast<unsigned int>(padding1.left));
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_left_padding));
 
         _kernel.update_shader_params();
         enqueue(*this, slice, _lws);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index ef65989f40..9cba37110b 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,11 @@
 
 using namespace arm_compute;
 
+GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
+    : _kernel(nullptr), _border_handler(), _shift_handler()
+{
+}
+
 void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
@@ -38,4 +43,15 @@ void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor
 
     // Configure border handler
     _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+    _shift_handler.configure(input);
+}
+
+void GCDepthwiseConvolutionLayer3x3::run()
+{
+    GCScheduler::get().dispatch(_shift_handler, false);
+    GCScheduler::get().memory_barrier();
+    GCScheduler::get().dispatch(_border_handler, false);
+    GCScheduler::get().memory_barrier();
+    GCScheduler::get().dispatch(*_kernel);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index 769733ca66..a2607d4c2d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -33,12 +33,13 @@
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
+
 GCDirectConvolutionLayer::GCDirectConvolutionLayer()
     : _kernel(nullptr), _border_handler(), _shift_handler()
 {
 }
 
-void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     int kernel_size = weights->info()->dimension(0);
 
@@ -68,14 +69,14 @@ void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor
 
     _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
 
-    _shift_handler.configure(output);
+    _shift_handler.configure(input);
 }
 
 void GCDirectConvolutionLayer::run()
 {
+    GCScheduler::get().dispatch(_shift_handler, false);
+    GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(*_kernel);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_shift_handler);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index ff03effd3f..dcbb39d87d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,10 +25,17 @@
 
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
+GCPoolingLayer::GCPoolingLayer()
+    : _kernel(nullptr), _border_handler(), _shift_handler()
+{
+}
+
 void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
 {
     // Configure pooling kernel
@@ -39,9 +46,20 @@ void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const Poolin
     // Configure border depending on operation required
     BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
+
+    _shift_handler.configure(input);
 }
 
 Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     return GCPoolingLayerKernel::validate(input, output, pool_info);
-}
-\ No newline at end of file
+}
+
+void GCPoolingLayer::run()
+{
+    GCScheduler::get().dispatch(_shift_handler, false);
+    GCScheduler::get().memory_barrier();
+    GCScheduler::get().dispatch(_border_handler, false);
+    GCScheduler::get().memory_barrier();
+    GCScheduler::get().dispatch(*_kernel);
+}
author	Frank Lei <frank.lei@arm.com>	2018-02-01 14:47:14 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:47:40 +0000
commit	4406fd6cc4abded564d3791324e1f48bdfd34273 (patch)
tree	22fe402fe9ac7ca338df49e9eccd6eb1587ae875
parent	898d399a0f62c15612a52df4bff5018e783214e4 (diff)
download	ComputeLibrary-4406fd6cc4abded564d3791324e1f48bdfd34273.tar.gz