APPBROWSER-391: Fix GLES COMPUTE alignment issues

APPBROWSER-402: Performance optimization for squeezenet/xray model Change-Id: If31b186b99a6d6087164019fe94d3ac9279e3204 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119526 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
author: Frank Lei <frank.lei@arm.com> 2018-02-01 14:47:14 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:47:40 +0000
commit: 4406fd6cc4abded564d3791324e1f48bdfd34273 (patch)
tree: 22fe402fe9ac7ca338df49e9eccd6eb1587ae875 /src/core/GLES_COMPUTE
parent: 898d399a0f62c15612a52df4bff5018e783214e4 (diff)
download: ComputeLibrary-4406fd6cc4abded564d3791324e1f48bdfd34273.tar.gz
14 files changed, 177 insertions, 93 deletions
diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
index 5576665243..19af777446 100644
--- a/src/core/GLES_COMPUTE/IGCTensor.cpp
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,7 @@
 using namespace arm_compute;
 
 IGCTensor::IGCTensor()
-    : _mapping(nullptr)
+    : _mapping(nullptr), _needs_shifting(false)
 {
 }
 
@@ -52,3 +52,13 @@ uint8_t *IGCTensor::buffer() const
 {
     return _mapping;
 }
+
+bool IGCTensor::needs_shifting() const
+{
+    return _needs_shifting;
+}
+
+void IGCTensor::set_needs_shifting(bool needs_shifting)
+{
+    _needs_shifting = needs_shifting;
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
index 0ff43605ba..2ab6d5eac5 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,20 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z =
 precision mediump float;
 #define ADD(x, y) (x) + (y)
 
-/** This function add two images.
+/** This function add two tensors.
  *
- * @param[in]  src1_ptr   Pointer to the first source image. Supported data types: F16
- * @param[in]  src1_attrs The attributes of the first source image
- * @param[in]  src2_ptr   Pointer to the second source image. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_attrs The attributes of the second source image
- * @param[out] dst_ptr    Pointer to the destination image. Supported data types: Same as @p src1_ptr
- * @param[in]  dst_attrs  The attributes of the destination image
+ * @param[in]  src1_ptr   Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src1_attrs The attributes of the first source tensor
+ * @param[in]  src2_ptr   Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_attrs The attributes of the second source tensor
+ * @param[out] dst_ptr    Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_attrs  The attributes of the destination tensor
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src1_attrs;
-    ImageAttributes src2_attrs;
-    ImageAttributes dst_attrs;
+    Tensor3DAttributes src1_attrs;
+    Tensor3DAttributes src2_attrs;
+    Tensor3DAttributes dst_attrs;
 };
 
 TENSOR_DECLARATION(1, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
@@ -51,9 +51,9 @@ TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
 
 void main(void)
 {
-    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
-    ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
-    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+    Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+    Tensor3DIterator dst_iter  = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4 tmp1[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
     vec4 tmp2[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src2_ptr, src2_iter);
@@ -62,4 +62,4 @@ void main(void)
     addition[1] = ADD(tmp1[1], tmp2[1]);
 
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, addition);
-}
-\ No newline at end of file
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 0c8b5bf0bb..69ac50b4d0 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,8 +53,8 @@ void main(void)
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -66,7 +66,7 @@ void main(void)
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 #endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
index b2689a257d..b72c3392aa 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,23 +29,23 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z =
 // We DO have to use highp for DATA_TYPE_FP16 float here to calculate the coordinates of source tensor. float is highp by default, but we still write it down here to make it more clearly, and mediump is only used for src/dst tensor in shader body.
 precision highp float;
 
-/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel FP16.
+/** Performs an affine transformation on an tensor interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel FP16.
  *
  * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: FP16.
  * @param[in]  src_attrs    The attributes of the source tensor
  * @param[out] dst_ptr      Pointer to the destination tensor. Supported data types: FP16. (Must be the same as the input)
  * @param[in]  dst_attrs    The attributes of the destination tensor
- * @param[in]  input_width  Input image width
- * @param[in]  input_height Input image height
+ * @param[in]  input_width  Input tensor width
+ * @param[in]  input_height Input tensor height
  * @param[in]  scale        The scale factor along x/y dimension
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src_attrs;
-    ImageAttributes dst_attrs;
-    float           input_width;
-    float           input_height;
-    vec2            scale;
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    float              input_width;
+    float              input_height;
+    vec2               scale;
 };
 
 #if defined(DATA_TYPE_FP16)
@@ -75,8 +75,8 @@ vec4[2] clamp_to_border_with_size(vec4[2] coords, float width, float height, flo
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4[2] tc = clamp_to_border_with_size(transform_nearest(vec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y), scale), input_width, input_height, float(BORDER_SIZE));
 
@@ -85,7 +85,7 @@ void main()
 
     for(int i = 0; i < 4; i++)
     {
-        uint offset_in_bytes = image_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]));
+        uint offset_in_bytes = tensor3D_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]), int(gl_GlobalInvocationID.z));
 
         s = LOAD_UNPACK2_HALF(src_ptr, uint(offset_in_bytes >> src_shift));
 
@@ -107,15 +107,15 @@ TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     uvec2 tc = uvec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y >> uint(1));
 
     mediump vec4 s = vec4(0.0f);
     mediump      vec4[2] d;
 
-    s = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, int(tc[0]), int(tc[1])));
+    s = LOAD_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, int(tc[0]), int(tc[1]), int(gl_GlobalInvocationID.z)));
 
     d[0] = vec4(s.x, s.x, s.y, s.y);
     d[1] = vec4(s.z, s.z, s.w, s.w);
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index b8672c662d..d7c645d09d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,16 +109,26 @@ void GCActivationLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
+    if(_input == _output)
+    {
+        slice_in.shift(Window::DimX, -(_input->info()->padding()).left);
+    }
 
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1;
-        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _input, binding++, slice_in);
         add_3D_tensor_argument(idx, _output, binding++, slice);
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
index caec324de2..06cf40990c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,18 +135,24 @@ void GCArithmeticAdditionKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_2D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1; // SSBO binding starts from 1.
-        add_2D_tensor_argument(idx, _input1, binding++, slice);
-        add_2D_tensor_argument(idx, _input2, binding++, slice);
-        add_2D_tensor_argument(idx, _output, binding++, slice);
+        add_3D_tensor_argument(idx, _input1, binding++, slice_in);
+        add_3D_tensor_argument(idx, _input2, binding++, slice_in);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index a41b62fbab..cd93f6997e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -119,7 +119,10 @@ void GCBatchNormalizationLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
 
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -130,14 +133,16 @@ void GCBatchNormalizationLayerKernel::run(const Window &window)
     add_1D_tensor_argument(idx, _beta, 5, vector_slice);
     add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
 
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 7b1848c32b..36d1b29bba 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -38,7 +38,7 @@
 using namespace arm_compute;
 
 GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
 {
 }
 
@@ -61,8 +61,9 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
 
-    _input  = input;
-    _output = output;
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
 
     // Add build options
     std::set<std::string> build_opts;
@@ -76,11 +77,8 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i
     _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
     _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
 
-    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
-
-    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
-    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
-    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+    build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom));
 
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
@@ -118,17 +116,24 @@ void GCDepthConcatenateLayerKernel::run(const Window &window)
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice     = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+    slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
-        add_3D_tensor_argument(idx, _output, 2, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice_out);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 28b5bd2d62..9343268d9e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,16 +173,20 @@ void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, con
     const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
 
     // Calculate input right and bottom border
-    const int input_width    = input->info()->dimension(0);
-    const int input_height   = input->info()->dimension(1);
-    const int padding_right  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + 2), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_left - input_width;
-    const int padding_bottom = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + 2), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_top - input_height;
+    const int input_width  = input->info()->dimension(0);
+    const int input_height = input->info()->dimension(1);
+
+    const int input_total_width  = std::max(int(input->info()->padding().left), int(_conv_pad_left)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_left));
+    const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_top)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_top));
+
+    const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_left;
+    const int input_padding_bottom = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_top;
 
     BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
 
     Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
 
-    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + padding_right, input_height + padding_bottom);
+    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + input_padding_right, input_height + input_padding_bottom);
     AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
     AccessWindowStatic bias_access    = AccessWindowStatic(nullptr, 0, 0, 0, 1);
 
@@ -224,6 +228,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Create input window and adjust
     Window win_in = window;
     win_in.adjust(Window::DimX, -_conv_pad_left, true);
@@ -246,6 +252,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
         add_1D_tensor_argument(idx, _biases, 4, slice_biases);
     }
 
+    slice_out.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx = 0;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 1b94626356..bef30d5042 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -394,6 +394,8 @@ void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Get initial windows
     Window slice  = window.first_slice_window_3D();
     Window win_in = window;
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index bc9c7eb55a..fac29024e3 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,6 +89,8 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window slice = window.first_slice_window_3D();
 
     Window slice_in;
@@ -100,15 +102,19 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
     add_1D_tensor_argument(idx, _mean, 3, slice_in);
     add_1D_tensor_argument(idx, _sd, 4, slice_in);
 
+    slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index c688cd4567..3a0944cd48 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -198,11 +198,14 @@ std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITenso
         const int output_height         = output->dimension(1);
         const int output_padding_right  = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width;
         const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
-        const int input_padding_right   = ceil_to_multiple(input_width + 2 * border_size.right, num_elems_processed_per_iteration) - (input_width + 2 * border_size.right);
-        const int input_padding_bottom  = ceil_to_multiple(input_height + 2 * border_size.bottom, 1) - (input_height + 2 * border_size.bottom);
+
+        const int input_total_width    = std::max(int(input->padding().left), int(pool_pad_x)) + input_width + std::max(int(input->padding().right), int(pool_pad_x));
+        const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_processed_per_iteration) - input_width - pool_pad_x;
+        const int input_total_height   = std::max(int(input->padding().top), int(pool_pad_y)) + input_height + std::max(int(input->padding().bottom), int(pool_pad_y));
+        const int input_padding_bottom = input_total_height - input_height - pool_pad_y;
 
         // Configure kernel window
-        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right + input_padding_right, input_height + border_size.bottom + input_padding_bottom);
+        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + input_padding_right, input_height + input_padding_bottom);
         AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
         bool               window_changed = update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -340,13 +343,19 @@ void GCPoolingLayerKernel::run(const Window &window)
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
+
+    Window slice         = window_collapsed.first_slice_window_3D();
+    Window slice_in_orig = window_collapsed.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
 
     do
     {
         // Upsample input by pool size
-        Window in_slice(slice); // NOLINT
+        Window in_slice(slice_in_orig); // NOLINT
         in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
         in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
 
@@ -358,5 +367,5 @@ void GCPoolingLayerKernel::run(const Window &window)
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_in_orig));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index f307cfb239..46d7ff9172 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,9 +128,34 @@ void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, Interpo
 
     IGCKernel::configure(win);
 
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the tensor parameters
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(0)));
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(1)));
     _kernel.set_argument<float>(idx++, wr);
     _kernel.set_argument<float>(idx++, hr);
 }
+
+void GCScaleKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index c2182171a6..21946b7f8d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -39,7 +39,7 @@ using namespace arm_compute;
 using namespace arm_compute::gles_compute;
 
 GCTensorShiftKernel::GCTensorShiftKernel()
-    : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
+    : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
 {
 }
 
@@ -59,18 +59,18 @@ void GCTensorShiftKernel::configure(IGCTensor *input)
     options.emplace(("#define " + dt_name));
 
     unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right;
-    unsigned int num_elems_written_per_iteration_y = 1;
-    unsigned int num_elems_written_per_iteration_z = 1;
 
     std::stringstream kernel_name;
     kernel_name << "tensorshift";
 
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
 
-    Window                 win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_written_per_iteration_x);
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_x));
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimY);
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimZ);
 
-    update_window_and_padding(win, input_access);
+    _left_padding = _input->info()->padding().left;
 
     IGCKernel::configure(win);
 }
@@ -80,6 +80,11 @@ void GCTensorShiftKernel::run(const Window &window)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
+    if(int(_left_padding) == 0 || !_input->needs_shifting())
+    {
+        return;
+    }
+
     _kernel.use();
 
     // Get initial windows
@@ -92,14 +97,7 @@ void GCTensorShiftKernel::run(const Window &window)
 
         add_3D_tensor_argument(idx, _input, 1, slice);
 
-        const PaddingSize &padding1 = _input->info()->padding();
-
-        if(int(padding1.left) == 0)
-        {
-            break;
-        }
-
-        _kernel.set_argument(idx++, static_cast<unsigned int>(padding1.left));
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_left_padding));
 
         _kernel.update_shader_params();
         enqueue(*this, slice, _lws);
author	Frank Lei <frank.lei@arm.com>	2018-02-01 14:47:14 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:47:40 +0000
commit	4406fd6cc4abded564d3791324e1f48bdfd34273 (patch)
tree	22fe402fe9ac7ca338df49e9eccd6eb1587ae875 /src/core/GLES_COMPUTE
parent	898d399a0f62c15612a52df4bff5018e783214e4 (diff)
download	ComputeLibrary-4406fd6cc4abded564d3791324e1f48bdfd34273.tar.gz