From 4406fd6cc4abded564d3791324e1f48bdfd34273 Mon Sep 17 00:00:00 2001 From: Frank Lei Date: Thu, 1 Feb 2018 14:47:14 +0800 Subject: APPBROWSER-391: Fix GLES COMPUTE alignment issues APPBROWSER-402: Performance optimization for squeezenet/xray model Change-Id: If31b186b99a6d6087164019fe94d3ac9279e3204 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119526 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- .../kernels/GCActivationLayerKernel.cpp | 18 +++++++++++--- .../kernels/GCArithmeticAdditionKernel.cpp | 18 +++++++++----- .../kernels/GCBatchNormalizationLayerKernel.cpp | 11 +++++--- .../kernels/GCDepthConcatenateLayerKernel.cpp | 29 +++++++++++++--------- .../GCDepthwiseConvolutionLayer3x3Kernel.cpp | 20 ++++++++++----- .../kernels/GCDirectConvolutionLayerKernel.cpp | 2 ++ .../kernels/GCNormalizePlanarYUVLayerKernel.cpp | 12 ++++++--- .../GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp | 21 +++++++++++----- src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp | 29 ++++++++++++++++++++-- .../GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp | 26 +++++++++---------- 10 files changed, 130 insertions(+), 56 deletions(-) (limited to 'src/core/GLES_COMPUTE/kernels') diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp index b8672c662d..d7c645d09d 100644 --- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -109,16 +109,26 @@ void GCActivationLayerKernel::run(const Window &window) _kernel.use(); - Window slice = window.first_slice_window_3D(); + _output->set_needs_shifting(true); + + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); + + if(_input == _output) + { + slice_in.shift(Window::DimX, -(_input->info()->padding()).left); + } do { unsigned int idx = 0; unsigned int binding = 1; - add_3D_tensor_argument(idx, _input, binding++, slice); + add_3D_tensor_argument(idx, _input, binding++, slice_in); add_3D_tensor_argument(idx, _output, binding++, slice); _kernel.update_shader_params(); enqueue(*this, slice); } - while(window.slide_window_slice_3D(slice)); + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp index caec324de2..06cf40990c 100644 --- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -135,18 +135,24 @@ void GCArithmeticAdditionKernel::run(const Window &window) _kernel.use(); - Window slice = window.first_slice_window_2D(); + _output->set_needs_shifting(true); + + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); + do { unsigned int idx = 0; unsigned int binding = 1; // SSBO binding starts from 1. - add_2D_tensor_argument(idx, _input1, binding++, slice); - add_2D_tensor_argument(idx, _input2, binding++, slice); - add_2D_tensor_argument(idx, _output, binding++, slice); + add_3D_tensor_argument(idx, _input1, binding++, slice_in); + add_3D_tensor_argument(idx, _input2, binding++, slice_in); + add_3D_tensor_argument(idx, _output, binding++, slice); _kernel.update_shader_params(); enqueue(*this, slice); } - while(window.slide_window_slice_2D(slice)); + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp index a41b62fbab..cd93f6997e 100644 --- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp @@ -119,7 +119,10 @@ void GCBatchNormalizationLayerKernel::run(const Window &window) _kernel.use(); - Window slice = window.first_slice_window_3D(); + _output->set_needs_shifting(true); + + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); Window vector_slice = window.first_slice_window_1D(); vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -130,14 +133,16 @@ void GCBatchNormalizationLayerKernel::run(const Window &window) add_1D_tensor_argument(idx, _beta, 5, vector_slice); add_1D_tensor_argument(idx, _gamma, 6, vector_slice); + slice.shift(Window::DimX, -(_output->info()->padding()).left); + do { idx = 0; - add_3D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _input, 1, slice_in); add_3D_tensor_argument(idx, _output, 2, slice); _kernel.update_shader_params(); enqueue(*this, slice); } - while(window.slide_window_slice_3D(slice)); + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp index 7b1848c32b..36d1b29bba 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp @@ -38,7 +38,7 @@ using namespace arm_compute; GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel() - : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) { } @@ -61,8 +61,9 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); - _input = input; - _output = output; + _input = input; + _output = output; + _depth_offset = depth_offset; // Add build options std::set build_opts; @@ -76,11 +77,8 @@ void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned i _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2]; - - build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right)); - build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom)); - build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes)); + build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right)); + build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom)); // Create kernel _kernel = static_cast(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); @@ -118,17 +116,24 @@ void GCDepthConcatenateLayerKernel::run(const Window &window) _kernel.use(); - Window slice = window.first_slice_window_3D(); + _output->set_needs_shifting(true); + + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + Window slice_out = window.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); + slice_out.set(Window::DimZ, Window::Dimension(_depth_offset)); do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, 1, slice); - add_3D_tensor_argument(idx, _output, 2, slice); + add_3D_tensor_argument(idx, _input, 1, slice_in); + add_3D_tensor_argument(idx, _output, 2, slice_out); _kernel.update_shader_params(); enqueue(*this, slice); } - while(window.slide_window_slice_3D(slice)); + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp index 28b5bd2d62..9343268d9e 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -173,16 +173,20 @@ void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, con const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height; // Calculate input right and bottom border - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int padding_right = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + 2), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_left - input_width; - const int padding_bottom = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + 2), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_top - input_height; + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + + const int input_total_width = std::max(int(input->info()->padding().left), int(_conv_pad_left)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_left)); + const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_top)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_top)); + + const int input_padding_right = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_left; + const int input_padding_bottom = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_top; BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0); Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border); - AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + padding_right, input_height + padding_bottom); + AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + input_padding_right, input_height + input_padding_bottom); AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0); AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1); @@ -224,6 +228,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window) _kernel.use(); + _output->set_needs_shifting(true); + // Create input window and adjust Window win_in = window; win_in.adjust(Window::DimX, -_conv_pad_left, true); @@ -246,6 +252,8 @@ void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window) add_1D_tensor_argument(idx, _biases, 4, slice_biases); } + slice_out.shift(Window::DimX, -(_output->info()->padding()).left); + do { unsigned int idx = 0; diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp index 1b94626356..bef30d5042 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp @@ -394,6 +394,8 @@ void GCDirectConvolutionLayerKernel::run(const Window &window) _kernel.use(); + _output->set_needs_shifting(true); + // Get initial windows Window slice = window.first_slice_window_3D(); Window win_in = window; diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp index bc9c7eb55a..fac29024e3 100644 --- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -89,6 +89,8 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window) _kernel.use(); + _output->set_needs_shifting(true); + Window slice = window.first_slice_window_3D(); Window slice_in; @@ -100,15 +102,19 @@ void GCNormalizePlanarYUVLayerKernel::run(const Window &window) add_1D_tensor_argument(idx, _mean, 3, slice_in); add_1D_tensor_argument(idx, _sd, 4, slice_in); + slice_in = window.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); + do { idx = 0; - add_3D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _input, 1, slice_in); add_3D_tensor_argument(idx, _output, 2, slice); _kernel.update_shader_params(); enqueue(*this, slice); } - while(window.slide_window_slice_3D(slice)); + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp index c688cd4567..3a0944cd48 100644 --- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp @@ -198,11 +198,14 @@ std::tuple validate_and_configure_window(ITenso const int output_height = output->dimension(1); const int output_padding_right = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width; const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height; - const int input_padding_right = ceil_to_multiple(input_width + 2 * border_size.right, num_elems_processed_per_iteration) - (input_width + 2 * border_size.right); - const int input_padding_bottom = ceil_to_multiple(input_height + 2 * border_size.bottom, 1) - (input_height + 2 * border_size.bottom); + + const int input_total_width = std::max(int(input->padding().left), int(pool_pad_x)) + input_width + std::max(int(input->padding().right), int(pool_pad_x)); + const int input_padding_right = ceil_to_multiple(input_total_width, num_elems_processed_per_iteration) - input_width - pool_pad_x; + const int input_total_height = std::max(int(input->padding().top), int(pool_pad_y)) + input_height + std::max(int(input->padding().bottom), int(pool_pad_y)); + const int input_padding_bottom = input_total_height - input_height - pool_pad_y; // Configure kernel window - AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right + input_padding_right, input_height + border_size.bottom + input_padding_bottom); + AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + input_padding_right, input_height + input_padding_bottom); AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom); bool window_changed = update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); @@ -340,13 +343,19 @@ void GCPoolingLayerKernel::run(const Window &window) _kernel.use(); + _output->set_needs_shifting(true); + Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); + + Window slice = window_collapsed.first_slice_window_3D(); + Window slice_in_orig = window_collapsed.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); do { // Upsample input by pool size - Window in_slice(slice); // NOLINT + Window in_slice(slice_in_orig); // NOLINT in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y)); @@ -358,5 +367,5 @@ void GCPoolingLayerKernel::run(const Window &window) _kernel.update_shader_params(); enqueue(*this, slice); } - while(window_collapsed.slide_window_slice_3D(slice)); + while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_in_orig)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp index f307cfb239..46d7ff9172 100644 --- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2018 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -128,9 +128,34 @@ void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, Interpo IGCKernel::configure(win); - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the tensor parameters + unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the tensor parameters _kernel.set_argument(idx++, static_cast(input->info()->dimension(0))); _kernel.set_argument(idx++, static_cast(input->info()->dimension(1))); _kernel.set_argument(idx++, wr); _kernel.set_argument(idx++, hr); } + +void GCScaleKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.use(); + + _output->set_needs_shifting(true); + + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + + slice.shift(Window::DimX, -(_output->info()->padding()).left); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, 1, slice_in); + add_3D_tensor_argument(idx, _output, 2, slice); + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp index c2182171a6..21946b7f8d 100644 --- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp @@ -39,7 +39,7 @@ using namespace arm_compute; using namespace arm_compute::gles_compute; GCTensorShiftKernel::GCTensorShiftKernel() - : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)) + : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0) { } @@ -59,18 +59,18 @@ void GCTensorShiftKernel::configure(IGCTensor *input) options.emplace(("#define " + dt_name)); unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right; - unsigned int num_elems_written_per_iteration_y = 1; - unsigned int num_elems_written_per_iteration_z = 1; std::stringstream kernel_name; kernel_name << "tensorshift"; _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name.str(), options)); - Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_written_per_iteration_x); + Window win; + win.set(Window::DimX, Window::Dimension(0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_x)); + win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimY); + win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimZ); - update_window_and_padding(win, input_access); + _left_padding = _input->info()->padding().left; IGCKernel::configure(win); } @@ -80,6 +80,11 @@ void GCTensorShiftKernel::run(const Window &window) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + if(int(_left_padding) == 0 || !_input->needs_shifting()) + { + return; + } + _kernel.use(); // Get initial windows @@ -92,14 +97,7 @@ void GCTensorShiftKernel::run(const Window &window) add_3D_tensor_argument(idx, _input, 1, slice); - const PaddingSize &padding1 = _input->info()->padding(); - - if(int(padding1.left) == 0) - { - break; - } - - _kernel.set_argument(idx++, static_cast(padding1.left)); + _kernel.set_argument(idx++, static_cast(_left_padding)); _kernel.update_shader_params(); enqueue(*this, slice, _lws); -- cgit v1.2.1