From 780db4eb6a9e3dee565d14f36d772038cd3253da Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 23 Nov 2017 09:49:51 +0000 Subject: COMPMID-471 Implement Deconvolution on OpenCL Change-Id: Ie00c6b08a51d30c5ce2637d40ee3d165b8a68686 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110311 Reviewed-by: Pablo Tello Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- src/core/CL/CLKernelLibrary.cpp | 7 +- src/core/CL/cl_kernels/deconvolution_layer.cl | 50 +++++++ .../kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 117 +++++++++++++++ .../kernels/NEDeconvolutionLayerUpsampleKernel.cpp | 165 --------------------- src/core/Utils.cpp | 32 ++-- 5 files changed, 183 insertions(+), 188 deletions(-) create mode 100644 src/core/CL/cl_kernels/deconvolution_layer.cl create mode 100644 src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp (limited to 'src/core') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index de75518a05..352b89baa5 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -186,6 +186,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "copy_plane", "channel_extract.cl" }, { "copy_planes_3p", "channel_combine.cl" }, { "copy_to_keypoint", "fast_corners.cl" }, + { "deconvolution_upsample", "deconvolution_layer.cl" }, { "depthwise_convolution_3x3", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" }, { "depthwise_im2col", "depthwise_convolution.cl" }, @@ -419,6 +420,10 @@ const std::map CLKernelLibrary::_program_source_map = { "convolution_rectangle.cl", #include "./cl_kernels/convolution_rectangle.clembed" + }, + { + "deconvolution_layer.cl", +#include "./cl_kernels/deconvolution_layer.clembed" }, { "depth_convert.cl", diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl new file mode 100644 index 0000000000..2514ddc8cc --- /dev/null +++ b/src/core/CL/cl_kernels/deconvolution_layer.cl @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function applies upsample on an input image. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: F32 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void deconvolution_upsample( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store result + *((__global float *)dst.ptr) = *((__global float *)src.ptr); +} diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp new file mode 100644 index 0000000000..5c08d5bee2 --- /dev/null +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2017, 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() + : _input(nullptr), _output(nullptr), _inner_border(), _info() +{ +} + +Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, "inner_border_right must be smaller that stride_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, "inner_border_top must be smaller that stride_y"); + + return Status{}; +} + +void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _inner_border = inner_border; + _info = info; + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), inner_border, info)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("deconvolution_upsample")); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal output_access(output->info(), 0, 0, num_elems_processed_per_iteration); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const int out_start_x = _info.pad().first; + const int out_end_x = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1; + const int out_step_x = _info.stride().first; + + const int out_start_y = _inner_border.top + _info.pad().second; + const int out_end_y = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1; + const int out_step_y = _info.stride().second; + + Window slice_out = window.first_slice_window_2D(); + slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp b/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp deleted file mode 100644 index 71db2e9782..0000000000 --- a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include -#include - -using namespace arm_compute; - -NEDeconvolutionLayerUpsampleKernel::NEDeconvolutionLayerUpsampleKernel() - : _offsets(nullptr), _input(nullptr), _output(nullptr) -{ -} - -BorderSize NEDeconvolutionLayerUpsampleKernel::border_size() const -{ - return BorderSize(1); -} - -void NEDeconvolutionLayerUpsampleKernel::configure(const ITensor *input, const ITensor *offsets, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0); - - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); - } - - _input = input; - _output = output; - _offsets = offsets; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - const int border_offset = border_size().left; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset); - AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, offsets_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEDeconvolutionLayerUpsampleKernel::scale_nearest(const Window &window) -{ - const size_t input_stride = _input->info()->strides_in_bytes()[1]; - - // Compute the ratio between source height and destination height - const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_off; - win_off.set(Window::DimX, window[Window::DimX]); - win_off.set(Window::DimY, window[Window::DimY]); - - for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(_input, win_in); - Iterator out(_output, window); - Iterator offsets(_offsets, win_off); - - switch(_input->info()->data_type()) - { - case DataType::F32: - { - float32x4x4_t tmp = - { - { - vdupq_n_f32(0), - vdupq_n_f32(0) - } - }; - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - - const size_t in_yi = (id.y() + 0.5f) * hr; - const size_t offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3); - - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3); - - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3); - - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3); - - vst4q_f32(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } -} - -void NEDeconvolutionLayerUpsampleKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - scale_nearest(window); -} diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 76d0b0f059..a8249c4840 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016, 2017, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -261,29 +261,17 @@ TensorShape arm_compute::deconvolution_output_shape(const std::pair arm_compute::deconvolution_output_dimensions( unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady, - unsigned int ax, unsigned int ay, float upscalex, float upscaley, DimensionRoundingType round) + unsigned int inner_border_right, unsigned int inner_border_top, unsigned int stride_x, unsigned int stride_y) { ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1); - ARM_COMPUTE_ERROR_ON(((in_width - 1) * upscalex + kernel_width + ax) < 2.f * padx); - ARM_COMPUTE_ERROR_ON(((in_height - 1) * upscaley + kernel_height + ay) < 2.f * pady); - const float fw = (in_width - 1) * upscalex - 2.f * padx + kernel_width + ax; - const float fh = (in_height - 1) * upscaley - 2.f * pady + kernel_height + ay; - int w = 0; - int h = 0; - switch(round) - { - case DimensionRoundingType::FLOOR: - w = std::floor(fw); - h = std::floor(fh); - break; - case DimensionRoundingType::CEIL: - w = std::ceil(fw); - h = std::ceil(fh); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } + ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width + inner_border_right) < 2 * padx); + ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height + inner_border_top) < 2 * pady); + const int padx_deconv = (kernel_width - padx - 1); + const int pady_deconv = (kernel_height - pady - 1); + ARM_COMPUTE_ERROR_ON(padx_deconv < 0); + ARM_COMPUTE_ERROR_ON(pady_deconv < 0); + const int w = stride_x * (in_width - 1) + kernel_width + inner_border_right - 2 * padx_deconv; + const int h = stride_y * (in_height - 1) + kernel_height + inner_border_top - 2 * pady_deconv; return std::make_pair(w, h); } -- cgit v1.2.1