From 780db4eb6a9e3dee565d14f36d772038cd3253da Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 23 Nov 2017 09:49:51 +0000 Subject: COMPMID-471 Implement Deconvolution on OpenCL Change-Id: Ie00c6b08a51d30c5ce2637d40ee3d165b8a68686 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110311 Reviewed-by: Pablo Tello Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- src/core/CL/CLKernelLibrary.cpp | 7 +- src/core/CL/cl_kernels/deconvolution_layer.cl | 50 +++++++ .../kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 117 +++++++++++++++ .../kernels/NEDeconvolutionLayerUpsampleKernel.cpp | 165 --------------------- src/core/Utils.cpp | 32 ++-- src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 132 +++++++++++++++++ .../CL/functions/CLDeconvolutionLayerUpsample.cpp | 64 ++++++++ .../NEON/functions/NEDeconvolutionLayer.cpp | 105 ++++++------- .../functions/NEDeconvolutionLayerUpsample.cpp | 121 --------------- 9 files changed, 433 insertions(+), 360 deletions(-) create mode 100644 src/core/CL/cl_kernels/deconvolution_layer.cl create mode 100644 src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp create mode 100644 src/runtime/CL/functions/CLDeconvolutionLayer.cpp create mode 100644 src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp delete mode 100644 src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp (limited to 'src') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index de75518a05..352b89baa5 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -186,6 +186,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "copy_plane", "channel_extract.cl" }, { "copy_planes_3p", "channel_combine.cl" }, { "copy_to_keypoint", "fast_corners.cl" }, + { "deconvolution_upsample", "deconvolution_layer.cl" }, { "depthwise_convolution_3x3", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" }, { "depthwise_im2col", "depthwise_convolution.cl" }, @@ -419,6 +420,10 @@ const std::map CLKernelLibrary::_program_source_map = { "convolution_rectangle.cl", #include "./cl_kernels/convolution_rectangle.clembed" + }, + { + "deconvolution_layer.cl", +#include "./cl_kernels/deconvolution_layer.clembed" }, { "depth_convert.cl", diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl new file mode 100644 index 0000000000..2514ddc8cc --- /dev/null +++ b/src/core/CL/cl_kernels/deconvolution_layer.cl @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function applies upsample on an input image. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: F32 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void deconvolution_upsample( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store result + *((__global float *)dst.ptr) = *((__global float *)src.ptr); +} diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp new file mode 100644 index 0000000000..5c08d5bee2 --- /dev/null +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2017, 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
+    : _input(nullptr), _output(nullptr), _inner_border(), _info()
+{
+}
+
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border,
+                                                    const PadStrideInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, "inner_border_right must be smaller than stride_x");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, "inner_border_top must be smaller than stride_y");
+
+    return Status{};
+}
+
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+                                                   const PadStrideInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _input        = input;
+    _output       = output;
+    _inner_border = inner_border;
+    _info         = info;
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), inner_border, info));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_upsample"));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output->info(), 0, 0, num_elems_processed_per_iteration);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const int out_start_x = _info.pad().first;
+    const int out_end_x   = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
+    const int out_step_x  = _info.stride().first;
+
+    const int out_start_y = _inner_border.top + _info.pad().second;
+    const int out_end_y   = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1;
+    const int out_step_y  = _info.stride().second;
+
+    Window slice_out = window.first_slice_window_2D();
+    slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+    slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+    Window slice_in = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx,
_output, slice_out); + enqueue(queue, *this, slice_out); + } + while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp b/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp deleted file mode 100644 index 71db2e9782..0000000000 --- a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include -#include - -using namespace arm_compute; - -NEDeconvolutionLayerUpsampleKernel::NEDeconvolutionLayerUpsampleKernel() - : _offsets(nullptr), _input(nullptr), _output(nullptr) -{ -} - -BorderSize NEDeconvolutionLayerUpsampleKernel::border_size() const -{ - return BorderSize(1); -} - -void NEDeconvolutionLayerUpsampleKernel::configure(const ITensor *input, const ITensor *offsets, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0); - - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); - } - - _input = input; - _output = output; - _offsets = offsets; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - const int border_offset = border_size().left; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset); - AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); 
- - update_window_and_padding(win, input_access, offsets_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEDeconvolutionLayerUpsampleKernel::scale_nearest(const Window &window) -{ - const size_t input_stride = _input->info()->strides_in_bytes()[1]; - - // Compute the ratio between source height and destination height - const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_off; - win_off.set(Window::DimX, window[Window::DimX]); - win_off.set(Window::DimY, window[Window::DimY]); - - for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(_input, win_in); - Iterator out(_output, window); - Iterator offsets(_offsets, win_off); - - switch(_input->info()->data_type()) - { - case DataType::F32: - { - float32x4x4_t tmp = - { - { - vdupq_n_f32(0), - vdupq_n_f32(0) - } - }; - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - - const size_t in_yi = (id.y() + 0.5f) * hr; - const size_t offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3); - - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3); - - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3); - - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3); - - vst4q_f32(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } -} - -void NEDeconvolutionLayerUpsampleKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - 
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - scale_nearest(window); -} diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 76d0b0f059..a8249c4840 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016, 2017, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -261,29 +261,17 @@ TensorShape arm_compute::deconvolution_output_shape(const std::pair arm_compute::deconvolution_output_dimensions( unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady, - unsigned int ax, unsigned int ay, float upscalex, float upscaley, DimensionRoundingType round) + unsigned int inner_border_right, unsigned int inner_border_top, unsigned int stride_x, unsigned int stride_y) { ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1); - ARM_COMPUTE_ERROR_ON(((in_width - 1) * upscalex + kernel_width + ax) < 2.f * padx); - ARM_COMPUTE_ERROR_ON(((in_height - 1) * upscaley + kernel_height + ay) < 2.f * pady); - const float fw = (in_width - 1) * upscalex - 2.f * padx + kernel_width + ax; - const float fh = (in_height - 1) * upscaley - 2.f * pady + kernel_height + ay; - int w = 0; - int h = 0; - switch(round) - { - case DimensionRoundingType::FLOOR: - w = std::floor(fw); - h = std::floor(fh); - break; - case DimensionRoundingType::CEIL: - w = std::ceil(fw); - h = std::ceil(fh); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } + ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width + inner_border_right) < 2 * padx); + ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height + inner_border_top) < 2 * pady); + const int padx_deconv = (kernel_width - padx - 1); + const int pady_deconv = (kernel_height - pady - 1); + ARM_COMPUTE_ERROR_ON(padx_deconv < 0); + ARM_COMPUTE_ERROR_ON(pady_deconv < 0); + const int w = stride_x * (in_width - 1) + kernel_width + inner_border_right - 2 * padx_deconv; + const int h = stride_y * (in_height - 1) + kernel_height + inner_border_top - 2 * pady_deconv; return std::make_pair(w, h); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp new file mode 100644 index 0000000000..1c55722344 --- /dev/null +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2017, 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include +#include + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _scaled_output() +{ +} + +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int inner_border_right, unsigned int inner_border_top) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1); + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y"); + + auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1), + info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y); + + const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top, + info))); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info)); + + return Status{}; +} + +void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + unsigned int inner_border_right, unsigned int inner_border_top) +{ + 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
+
+    _memory_group.manage(&_scaled_output);
+
+    // configure scale function
+    // Init and allocate intermediate tensor for output, same size as input but the first two axis are the same as the output tensor
+    TensorShape scale_out_shape(input->info()->tensor_shape());
+    const unsigned int out_x = input->info()->dimension(0) + (input->info()->dimension(0) - 1) * (stride_x - 1) + inner_border_right + 2 * info.pad().first;
+    const unsigned int out_y = input->info()->dimension(1) + (input->info()->dimension(1) - 1) * (stride_y - 1) + inner_border_top + 2 * info.pad().second;
+    scale_out_shape.set(0, out_x);
+    scale_out_shape.set(1, out_y);
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _scaled_output.allocator()->init(scale_out_info);
+
+    _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), info);
+
+    // setup the function to convolve the upscaled output
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+    _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
+    _scaled_output.allocator()->allocate();
+}
+
+void CLDeconvolutionLayer::run()
+{
+    _memory_group.acquire();
+    _scale_f.run();
+    _conv_f.run();
+    _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
new file mode 100644
index 0000000000..13a24f8ba4
--- /dev/null
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include +#include + +using namespace arm_compute; + +CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT + : _upsample(), + _output(nullptr) +{ +} + +Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border, + const PadStrideInfo &info) +{ + return CLDeconvolutionLayerUpsampleKernel::validate(input, output, inner_border, info); +} + +void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _output = output; + _upsample.configure(input, _output, inner_border, info); +} + +void CLDeconvolutionLayerUpsample::run() +{ + _output->map(CLScheduler::get().queue(), true); + memset(_output->buffer(), 0, _output->info()->total_size()); + _output->unmap(CLScheduler::get().queue()); + + CLScheduler::get().enqueue(_upsample, false); +} diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 7b4e77b296..c4bca11d14 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017, 2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,38 +24,41 @@ #include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), - _scale_f(), _conv_f(), - _scaled_output() + _scaled_output(), + _input(nullptr), + _info(), + _inner_border() { } void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, - unsigned int ax, unsigned int ay, float upscalex, float upscaley) + unsigned int inner_border_right, unsigned int inner_border_top) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1), - info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round()); + _input = input; + _info = info; + _inner_border = std::make_pair(inner_border_right, inner_border_top); - const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape()); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1), + info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias); + const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape()); ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); @@ -64,51 +67,51 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _memory_group.manage(&_scaled_output); // configure scale function - //Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor - TensorShape scale_out_shape(input->info()->tensor_shape()); - scale_out_shape.set(0, output->info()->dimension(0)); - scale_out_shape.set(1, output->info()->dimension(1)); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + // Init and allocate intermmidiate tensor for 
output, same size as input but the first two axis are the same as the output tensor + const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type(), + input->info()->fixed_point_position()); _scaled_output.allocator()->init(scale_out_info); - const unsigned int kernel_size = weights->info()->dimension(0); - // Padding for the upsampled image is calculated with the equiation: p' = k - p - 1, where k is kernel size and p is the input padding - ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1)); - const unsigned int tr_px = kernel_size - info.pad().first - 1; - const unsigned int tr_py = kernel_size - info.pad().second - 1; - const unsigned int tr_stride = 1; - const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py); - _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info); + // setup the function to convolve the upscaled output - switch(kernel_size) - { - case 1: - { - _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); - break; - } - case 3: - { - _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); - break; - } - case 5: - { - _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL)); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - break; - } - } + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, weights, bias, output, conv_info); _scaled_output.allocator()->allocate(); } void NEDeconvolutionLayer::run() { _memory_group.acquire(); - _scale_f.run(); + + // Initialize _scaled_output buffer + const int width_in = _input->info()->dimension(0); + const int height_in = _input->info()->dimension(1); + const int width_scaled = _scaled_output.info()->dimension(0); + const int height_scaled = _scaled_output.info()->dimension(1); + const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in); + const int stride_x = _info.stride().first; + const int stride_y = _info.stride().second; + + std::fill_n(reinterpret_cast(_scaled_output.buffer()), _scaled_output.info()->tensor_shape().total_size(), 0.f); + + // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output + // and insert rows and columns with zeroes depending on the stride values. 
+ for(int slice = 0; slice < num_2d_slices; ++slice) + { + const int start_x = _info.pad().first; + const int start_y = _inner_border.second + _info.pad().second; + const int end_y = height_scaled - _info.pad().second; + const int end_x = width_scaled - _inner_border.first - _info.pad().first; + + for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++) + { + for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++) + { + const auto in = *(reinterpret_cast(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice)))); + *(reinterpret_cast(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in; + } + } + } + _conv_f.run(); _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp deleted file mode 100644 index 63f17bcb5a..0000000000 --- a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h" - -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/ToolchainSupport.h" - -#include -#include -#include - -using namespace arm_compute; - -namespace -{ -inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair &a, - const std::pair &iz, const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON(nullptr == offsets); - Window win; - const int padx = info.pad().first; - const int pady = info.pad().second; - const int ax = a.first; - const int ay = a.second; - const int offset_width = offsets->info()->dimension(0); - const int offset_height = offsets->info()->dimension(1); - // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image. 
- // Step value along the XY axis will depend on the number of zeros to be inserted between samples (number of zeros + 1). - // Pre-compute the X offset, Y's stride is unknown at this point so we can't precompute Y's offsets - for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second)) - { - for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first)) - { - int *ptr = reinterpret_cast(offsets->ptr_to_element(Coordinates(xi, yi))); - const size_t in_xi = (xi + 0.5f) * wr; - *reinterpret_cast(ptr) = in_xi * input_element_size; - } - } -} -} // namespace - -NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _offsets(), - _border_handler(), - _upsample() -{ -} - -void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair &a, - const std::pair &iz, const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); - } - - // Get the tensor shape - const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1)); - - // Compute the ratio between source width/height and destination width/height - const auto wr = static_cast(input->info()->dimension(0)) / static_cast(output->info()->dimension(0)); - const auto hr = static_cast(input->info()->dimension(1)) / static_cast(output->info()->dimension(1)); - ARM_COMPUTE_UNUSED(hr); - // Get the element size of the input image - const size_t input_element_size = input->info()->element_size(); - - TensorInfo tensor_info_offsets(shape, Format::S32); - _offsets.allocator()->init(tensor_info_offsets); - - _upsample.configure(input, &_offsets, output); - - // Allocate once the configure methods have been called - _offsets.allocator()->allocate(); - // Pre-compute offsets for nearest interpolation - std::fill_n(reinterpret_cast(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size); - precompute_offsets(&_offsets, wr, input_element_size, a, iz, info); - - _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0)); -} - -void NEDeconvolutionLayerUpsample::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - _memory_group.acquire(); - NEScheduler::get().schedule(&_upsample, Window::DimY); - _memory_group.release(); -} -- cgit v1.2.1
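Usage note: the sketch below shows how the CLDeconvolutionLayer function added by this patch might be driven from user code. Only the configure()/run() signatures, the F32/square-kernel requirements and the inner_border-smaller-than-stride constraint are taken from the diff above; the tensor shapes, the pad/stride values and the CLTensor/CLScheduler setup calls are illustrative assumptions, not part of the patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // Illustrative only: create the default OpenCL context and queue.
    CLScheduler::get().default_init();

    // Hypothetical shapes: 4x4 F32 input, one 3x3 kernel, single channel and batch.
    CLTensor input, weights, bias, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 1U, 1U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));

    // Stride 2, pad 1; inner_border_right/top = 1, which satisfies the
    // "must be smaller than stride" check in validate().
    const PadStrideInfo deconv_info(2, 2, 1, 1, DimensionRoundingType::CEIL);
    const unsigned int  inner_border_right = 1;
    const unsigned int  inner_border_top   = 1;

    // Leaving the output TensorInfo empty lets configure() auto-initialise it
    // from deconvolution_output_dimensions()/deconvolution_output_shape().
    CLDeconvolutionLayer deconv;
    deconv.configure(&input, &weights, &bias, &output, deconv_info, inner_border_right, inner_border_top);

    // Allocate backing memory once the function is configured.
    input.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input/weights/bias here (e.g. via map()/unmap()), then:
    deconv.run();
    CLScheduler::get().sync();

    return 0;
}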