From 04a8f8c4994f1c32b3f16a832c0e6f2599364c02 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Thu, 23 Nov 2017 11:45:24 +0000 Subject: COMPMID-692 Consistent names for the interfaces Change-Id: I4b1f3f0da9ff5342c7de7083736fe91871d14e5b Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110351 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas Reviewed-by: Anthony Barbier --- src/core/CL/kernels/CLDepthConcatenateKernel.cpp | 139 ------ .../CL/kernels/CLDepthConcatenateLayerKernel.cpp | 139 ++++++ src/core/CL/kernels/CLDepthConvertKernel.cpp | 117 ----- src/core/CL/kernels/CLDepthConvertLayerKernel.cpp | 117 +++++ .../CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp | 219 --------- .../CLDepthwiseConvolutionLayer3x3Kernel.cpp | 219 +++++++++ src/core/CL/kernels/CLL2NormalizeKernel.cpp | 110 ----- src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 110 +++++ .../kernels/GCDepthConcatenateKernel.cpp | 143 ------ .../kernels/GCDepthConcatenateLayerKernel.cpp | 143 ++++++ src/core/NEON/kernels/NEDepthConcatenateKernel.cpp | 170 ------- .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 170 +++++++ src/core/NEON/kernels/NEDepthConvertKernel.cpp | 524 --------------------- .../NEON/kernels/NEDepthConvertLayerKernel.cpp | 524 +++++++++++++++++++++ .../kernels/NEDepthwiseConvolution3x3Kernel.cpp | 186 -------- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 186 ++++++++ src/core/NEON/kernels/NEL2NormalizeKernel.cpp | 126 ----- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 126 +++++ src/graph/operations/CLSimpleOperations.cpp | 10 +- src/graph/operations/NESimpleOperations.cpp | 10 +- src/runtime/CL/functions/CLDepthConcatenate.cpp | 78 --- .../CL/functions/CLDepthConcatenateLayer.cpp | 78 +++ src/runtime/CL/functions/CLDepthConvert.cpp | 38 -- src/runtime/CL/functions/CLDepthConvertLayer.cpp | 38 ++ .../CL/functions/CLDepthwiseConvolution.cpp | 138 ------ .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 138 ++++++ src/runtime/CL/functions/CLL2Normalize.cpp | 63 --- src/runtime/CL/functions/CLL2NormalizeLayer.cpp | 63 +++ src/runtime/CL/functions/CLLaplacianPyramid.cpp | 2 +- .../GLES_COMPUTE/functions/GCDepthConcatenate.cpp | 69 --- .../functions/GCDepthConcatenateLayer.cpp | 69 +++ src/runtime/NEON/functions/NEDepthConcatenate.cpp | 74 --- .../NEON/functions/NEDepthConcatenateLayer.cpp | 74 +++ src/runtime/NEON/functions/NEDepthConvert.cpp | 38 -- src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 38 ++ .../NEON/functions/NEDepthwiseConvolution.cpp | 126 ----- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 126 +++++ src/runtime/NEON/functions/NEL2Normalize.cpp | 57 --- src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 57 +++ src/runtime/NEON/functions/NELaplacianPyramid.cpp | 2 +- 40 files changed, 2427 insertions(+), 2427 deletions(-) delete mode 100644 src/core/CL/kernels/CLDepthConcatenateKernel.cpp create mode 100644 src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLDepthConvertKernel.cpp create mode 100644 src/core/CL/kernels/CLDepthConvertLayerKernel.cpp delete mode 100644 src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp create mode 100644 src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLL2NormalizeKernel.cpp create mode 100644 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp delete mode 100644 src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDepthConcatenateKernel.cpp create mode 100644 src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDepthConvertKernel.cpp create mode 100644 src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp delete mode 100644 src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp delete mode 100644 src/core/NEON/kernels/NEL2NormalizeKernel.cpp create mode 100644 src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp delete mode 100644 src/runtime/CL/functions/CLDepthConcatenate.cpp create mode 100644 src/runtime/CL/functions/CLDepthConcatenateLayer.cpp delete mode 100644 src/runtime/CL/functions/CLDepthConvert.cpp create mode 100644 src/runtime/CL/functions/CLDepthConvertLayer.cpp delete mode 100644 src/runtime/CL/functions/CLDepthwiseConvolution.cpp create mode 100644 src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp delete mode 100644 src/runtime/CL/functions/CLL2Normalize.cpp create mode 100644 src/runtime/CL/functions/CLL2NormalizeLayer.cpp delete mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp create mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEDepthConcatenate.cpp create mode 100644 src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEDepthConvert.cpp create mode 100644 src/runtime/NEON/functions/NEDepthConvertLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEDepthwiseConvolution.cpp create mode 100644 src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp delete mode 100644 src/runtime/NEON/functions/NEL2Normalize.cpp create mode 100644 src/runtime/NEON/functions/NEL2NormalizeLayer.cpp (limited to 'src') diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp deleted file mode 100644 index edfbf829ed..0000000000 --- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; - -CLDepthConcatenateKernel::CLDepthConcatenateKernel() - : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) -{ -} - -BorderSize CLDepthConcatenateKernel::border_size() const -{ - return BorderSize(_top_bottom, _left_right); -} - -void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) -{ - static std::map> configs_map = - { - { 1, { "uchar", 16 } }, - { 2, { "ushort", 8 } }, - { 4, { "uint", 4 } }, - { 8, { "ulong", 2 } }, - }; - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end()); - - // The gaps between the two lowest dimensions of input and output need to be divisible by 2 - // Otherwise it is not clear how the padding should be added onto the input tensor - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); - - _input = input; - _output = output; - _depth_offset = depth_offset; - - // Add build options - auto config = configs_map.find(static_cast(input->info()->element_size())); - std::set build_opts; - build_opts.emplace(("-DDATA_TYPE=" + config->second.first)); - build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second))); - - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); - - // Configure kernel window - _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; - _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); - const unsigned int num_rows_read_per_iteration = 1; - - // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); - - AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - ICLKernel::configure(win); -} - -void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters - const cl_int3 offsets = - { - { - static_cast(_left_right), - static_cast(_top_bottom), - static_cast(offset_to_first_elements_in_bytes), - } - }; - _kernel.setArg(idx, offsets); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } - while(window.slide_window_slice_3D(slice)); -} diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp new file mode 100644 index 0000000000..0275d4fd83 --- /dev/null +++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) +{ +} + +BorderSize CLDepthConcatenateLayerKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) +{ + static std::map> configs_map = + { + { 1, { "uchar", 16 } }, + { 2, { "ushort", 8 } }, + { 4, { "uint", 4 } }, + { 8, { "ulong", 2 } }, + }; + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end()); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + _depth_offset = depth_offset; + + // Add build options + auto config = configs_map.find(static_cast(input->info()->element_size())); + std::set build_opts; + build_opts.emplace(("-DDATA_TYPE=" + config->second.first)); + build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); + + // Configure kernel window + _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); + + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2]; + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters + const cl_int3 offsets = + { + { + static_cast(_left_right), + static_cast(_top_bottom), + static_cast(offset_to_first_elements_in_bytes), + } + }; + _kernel.setArg(idx, offsets); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp deleted file mode 100644 index b2132073d5..0000000000 --- a/src/core/CL/kernels/CLDepthConvertKernel.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include -#include -#include - -using namespace arm_compute; - -void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16, - DataType::U16, DataType::U32, DataType::S32, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16, - DataType::U16, DataType::U32, DataType::S32, DataType::F32); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different"); - ARM_COMPUTE_ERROR_ON(shift >= 8); - - // Check if convertion is supported - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32, - "Only data types supported [in] QS8 -> [out] F32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32), - "Only data types supported [in] QS16 -> [out] F32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16), - "Only data types supported [in] F32 -> [out] QS8, QS16"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16 - && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32), - "Only data types supported [in] U8 -> [out] U16, S16, U32, S32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 - && output->info()->data_type() != DataType::S32), - "Only data types supported [in] U16 -> [out] U8, U32, S32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 - && output->info()->data_type() != DataType::S32), - "Only data types supported [in] S16 -> [out] U8, U32, S32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 - && output->info()->data_type() != DataType::S16), - "Only data types supported [in] U32 -> [out] U8, U16, S16"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 - && output->info()->data_type() != DataType::S16), - "Only data types supported [in] S32 -> [out] U8, U16, S16"); - - // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - // Get data sizes - const size_t input_size = data_size_from_type(input->info()->data_type()); - const size_t output_size = data_size_from_type(output->info()->data_type()); - - // Construct kernel name and build options - std::string kernel_name = "convert_depth"; - std::set build_opts; - if(input_size > output_size) - { - kernel_name += "_down"; - // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined - build_opts.insert(((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); - } - else - { - kernel_name += "_up"; - } - build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); - } - - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); - - // Set shift arg - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, shift); - - // Configure kernel - constexpr unsigned int num_elems_processed_per_iteration = 16; - ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); -} diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp new file mode 100644 index 0000000000..83908a1469 --- /dev/null +++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16, + DataType::U16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16, + DataType::U16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different"); + ARM_COMPUTE_ERROR_ON(shift >= 8); + + // Check if convertion is supported + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32, + "Only data types supported [in] QS8 -> [out] F32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32), + "Only data types supported [in] QS16 -> [out] F32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16), + "Only data types supported [in] F32 -> [out] QS8, QS16"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16 + && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U8 -> [out] U16, S16, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types supported [in] S16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] U32 -> [out] U8, U16, S16"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] S32 -> [out] U8, U16, S16"); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + // Get data sizes + const size_t input_size = data_size_from_type(input->info()->data_type()); + const size_t output_size = data_size_from_type(output->info()->data_type()); + + // Construct kernel name and build options + std::string kernel_name = "convert_depth"; + std::set build_opts; + if(input_size > output_size) + { + kernel_name += "_down"; + // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined + build_opts.insert(((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); + } + else + { + kernel_name += "_up"; + } + build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) + { + build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); + } + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set shift arg + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, shift); + + // Configure kernel + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp deleted file mode 100644 index e86c55fbc0..0000000000 --- a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -using namespace arm_compute; - -namespace -{ -/** Calculates expected output shape dimension - * - * @param[in] Input shape - * - * @return Expected output shape - */ -TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info) -{ - unsigned int output_width = 0; - unsigned int output_height = 0; - - std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info); - - TensorShape output_shape = input_shape; - output_shape.set(0, output_width); - output_shape.set(1, output_height); - - return output_shape; -} -} // namespace - -CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0) -{ -} - -BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const -{ - return _border_size; -} - -void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3); - - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(weights->info()->data_type())) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); - } - - // Get convolved dimensions - TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), - output_shape, - 1, - input->info()->data_type(), - input->info()->fixed_point_position(), - input->info()->quantization_info()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - - _input = input; - _output = output; - _weights = weights; - _biases = biases; - _conv_stride_x = conv_info.stride().first; - _conv_stride_y = conv_info.stride().second; - _conv_pad_left = conv_info.pad_left(); - _conv_pad_top = conv_info.pad_top(); - _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); - - // Set build options - ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3); - CLBuildOptions build_opts; - build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x)); - build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); - - // Create kernel - std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3"; - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); - - // Set static arguments - if(is_data_type_quantized_asymmetric(_input->info()->data_type())) - { - float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - - unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0); - - _kernel.setArg(idx++, -_input->info()->quantization_info().offset); - _kernel.setArg(idx++, -_weights->info()->quantization_info().offset); - _kernel.setArg(idx++, _output->info()->quantization_info().offset); - _kernel.setArg(idx++, output_multiplier); - _kernel.setArg(idx++, output_shift); - } - - // Configure the local work size for Bifrost with a value obtained - // via exhaustive autotuning for the MobileNets tensor shapes. - const GPUTarget gpu_target = get_arch_from_target(get_target()); - if(gpu_target == GPUTarget::BIFROST) - { - const size_t width = input->info()->dimension(0); - if(width >= 56) // 56 or 112 - { - _lws_hint = cl::NDRange(8, 5, 2); - } - else if(width >= 14) // 14 or 28 - { - _lws_hint = cl::NDRange(1, 5, 2); - } - else // 7 - { - _lws_hint = cl::NDRange(1, 1, 2); - } - } - - // Configure kernel window - const unsigned int num_elems_processed_per_iteration = 2; - const unsigned int num_elems_written_per_iteration = 2; - const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x; - const unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1)); - - update_window_and_padding(win, input_access, weights_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure(win); -} - -void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Create input window and adjust - Window win_in = window; - win_in.adjust(Window::DimX, -_conv_pad_left, true); - win_in.adjust(Window::DimY, -_conv_pad_top, true); - win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x); - win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - Window slice_out = window.first_slice_window_3D(); - Window slice_weights = window.first_slice_window_3D(); - slice_weights.set_dimension_step(Window::DimX, 0); - slice_weights.set_dimension_step(Window::DimY, 0); - - // Set biases - if(_biases != nullptr) - { - unsigned int idx = 3 * num_arguments_per_3D_tensor(); - Window slice_biases; - slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); - add_1D_tensor_argument(idx, _biases, slice_biases); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); - add_3D_tensor_argument(idx, _weights, slice_weights); - - enqueue(queue, *this, slice_out, _lws_hint); - } - while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); -} diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp new file mode 100644 index 0000000000..003f1f8330 --- /dev/null +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +using namespace arm_compute; + +namespace +{ +/** Calculates expected output shape dimension + * + * @param[in] Input shape + * + * @return Expected output shape + */ +TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info) +{ + unsigned int output_width = 0; + unsigned int output_height = 0; + + std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info); + + TensorShape output_shape = input_shape; + output_shape.set(0, output_width); + output_shape.set(1, output_height); + + return output_shape; +} +} // namespace + +CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel() + : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0) +{ +} + +BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const +{ + return _border_size; +} + +void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3); + + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(weights->info()->data_type())) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + // Get convolved dimensions + TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), + output_shape, + 1, + input->info()->data_type(), + input->info()->fixed_point_position(), + input->info()->quantization_info()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + _weights = weights; + _biases = biases; + _conv_stride_x = conv_info.stride().first; + _conv_stride_y = conv_info.stride().second; + _conv_pad_left = conv_info.pad_left(); + _conv_pad_top = conv_info.pad_top(); + _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); + + // Set build options + ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3); + CLBuildOptions build_opts; + build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x)); + build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); + + // Create kernel + std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3"; + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Set static arguments + if(is_data_type_quantized_asymmetric(_input->info()->data_type())) + { + float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + + unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0); + + _kernel.setArg(idx++, -_input->info()->quantization_info().offset); + _kernel.setArg(idx++, -_weights->info()->quantization_info().offset); + _kernel.setArg(idx++, _output->info()->quantization_info().offset); + _kernel.setArg(idx++, output_multiplier); + _kernel.setArg(idx++, output_shift); + } + + // Configure the local work size for Bifrost with a value obtained + // via exhaustive autotuning for the MobileNets tensor shapes. + const GPUTarget gpu_target = get_arch_from_target(get_target()); + if(gpu_target == GPUTarget::BIFROST) + { + const size_t width = input->info()->dimension(0); + if(width >= 56) // 56 or 112 + { + _lws_hint = cl::NDRange(8, 5, 2); + } + else if(width >= 14) // 14 or 28 + { + _lws_hint = cl::NDRange(1, 5, 2); + } + else // 7 + { + _lws_hint = cl::NDRange(1, 1, 2); + } + } + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 2; + const unsigned int num_elems_written_per_iteration = 2; + const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x; + const unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1)); + + update_window_and_padding(win, input_access, weights_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Create input window and adjust + Window win_in = window; + win_in.adjust(Window::DimX, -_conv_pad_left, true); + win_in.adjust(Window::DimY, -_conv_pad_top, true); + win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x); + win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y); + + Window slice_in = win_in.first_slice_window_3D(); + Window slice_out = window.first_slice_window_3D(); + Window slice_weights = window.first_slice_window_3D(); + slice_weights.set_dimension_step(Window::DimX, 0); + slice_weights.set_dimension_step(Window::DimY, 0); + + // Set biases + if(_biases != nullptr) + { + unsigned int idx = 3 * num_arguments_per_3D_tensor(); + Window slice_biases; + slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); + add_1D_tensor_argument(idx, _biases, slice_biases); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + add_3D_tensor_argument(idx, _weights, slice_weights); + + enqueue(queue, *this, slice_out, _lws_hint); + } + while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); +} diff --git a/src/core/CL/kernels/CLL2NormalizeKernel.cpp b/src/core/CL/kernels/CLL2NormalizeKernel.cpp deleted file mode 100644 index 3e0758c980..0000000000 --- a/src/core/CL/kernels/CLL2NormalizeKernel.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/FixedPoint.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLL2NormalizeKernel::CLL2NormalizeKernel() - : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12) -{ -} - -void CLL2NormalizeKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - - // Sum and output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); - ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0"); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - _input = input; - _sum = sum; - _output = output; - _axis = axis; - _epsilon = epsilon; - - const unsigned int num_elems_processed_per_iteration = 16; - - // Set build options - std::set build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts)); - - // Set epsilon argument - unsigned int idx = num_arguments_per_1D_tensor() * 3; - _kernel.setArg(idx, _epsilon); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure(win); -} - -void CLL2NormalizeKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - - Window in_slice = window.first_slice_window_1D(); - Window sum_slice = window_sum.first_slice_window_1D(); - - do - { - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _sum, sum_slice); - add_1D_tensor_argument(idx, _output, in_slice); - enqueue(queue, *this, in_slice); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); -} diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp new file mode 100644 index 0000000000..36e351e048 --- /dev/null +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel() + : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12) +{ +} + +void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Sum and output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0"); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _sum = sum; + _output = output; + _axis = axis; + _epsilon = epsilon; + + const unsigned int num_elems_processed_per_iteration = 16; + + // Set build options + std::set build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts)); + + // Set epsilon argument + unsigned int idx = num_arguments_per_1D_tensor() * 3; + _kernel.setArg(idx, _epsilon); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_sum(window); + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + + Window in_slice = window.first_slice_window_1D(); + Window sum_slice = window_sum.first_slice_window_1D(); + + do + { + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _sum, sum_slice); + add_1D_tensor_argument(idx, _output, in_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp deleted file mode 100644 index b90a8e7b89..0000000000 --- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" -#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -GCDepthConcatenateKernel::GCDepthConcatenateKernel() - : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) -{ -} - -BorderSize GCDepthConcatenateKernel::border_size() const -{ - return BorderSize(_top_bottom, _left_right); -} - -void GCDepthConcatenateKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - - // The gaps between the two lowest dimensions of input and output need to be divisible by 2 - // Otherwise it is not clear how the padding should be added onto the input tensor - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); - - _input = input; - _output = output; - - // Add build options - std::set build_opts; - std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; - build_opts.emplace(("#define " + dt_name)); - build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); - build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); - build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); - - // Configure kernel window - _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; - _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - - const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2]; - - build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right)); - build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom)); - build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes)); - - // Create kernel - _kernel = static_cast(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); - - unsigned int num_elems_processed_per_iteration = 1; - unsigned int num_elems_read_per_iteration = 1; - if(input->info()->data_type() == DataType::F32) - { - num_elems_processed_per_iteration = 1; - num_elems_read_per_iteration = 1; - } - else if(input->info()->data_type() == DataType::F16) - { - num_elems_processed_per_iteration = 4; - num_elems_read_per_iteration = 4; - } - const unsigned int num_rows_read_per_iteration = 1; - - // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); - - AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - IGCKernel::configure(win); -} - -void GCDepthConcatenateKernel::run(const Window &window) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); - - _kernel.use(); - - Window slice = window.first_slice_window_3D(); - - do - { - if(_input->info()->data_type() == DataType::F32) - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, 1, slice); - add_3D_tensor_argument(idx, _output, 2, slice); - } - else if(_input->info()->data_type() == DataType::F16) - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice); - add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice); - } - - _kernel.update_shader_params(); - - enqueue(*this, slice); - } - while(window.slide_window_slice_3D(slice)); -} diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp new file mode 100644 index 0000000000..a6111782fd --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) +{ +} + +BorderSize GCDepthConcatenateLayerKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + + // Add build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Configure kernel window + _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2]; + + build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right)); + build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom)); + build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes)); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); + + unsigned int num_elems_processed_per_iteration = 1; + unsigned int num_elems_read_per_iteration = 1; + if(input->info()->data_type() == DataType::F32) + { + num_elems_processed_per_iteration = 1; + num_elems_read_per_iteration = 1; + } + else if(input->info()->data_type() == DataType::F16) + { + num_elems_processed_per_iteration = 4; + num_elems_read_per_iteration = 4; + } + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); + + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + IGCKernel::configure(win); +} + +void GCDepthConcatenateLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + if(_input->info()->data_type() == DataType::F32) + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _output, 2, slice); + } + else if(_input->info()->data_type() == DataType::F16) + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice); + add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice); + } + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp deleted file mode 100644 index 7a62b0cb03..0000000000 --- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include - -using namespace arm_compute; - -namespace -{ -// Overloads of 128-bit vector loads -uint8x16_t loadq(const uint8_t *ptr) -{ - return vld1q_u8(ptr); -} -uint16x8_t loadq(const uint16_t *ptr) -{ - return vld1q_u16(ptr); -} -uint32x4_t loadq(const uint32_t *ptr) -{ - return vld1q_u32(ptr); -} -// Overloads of 128-bit vector stores -void storeq(uint8_t *ptr, uint8x16_t val) -{ - return vst1q_u8(ptr, val); -} -void storeq(uint16_t *ptr, uint16x8_t val) -{ - return vst1q_u16(ptr, val); -} -void storeq(uint32_t *ptr, uint32x4_t val) -{ - return vst1q_u32(ptr, val); -} - -template -void depth_concat(const ITensor *in, ITensor *out, std::pair start_xy, int depth_offset, const Window &window) -{ - const int start_x = start_xy.first; - const int start_y = start_xy.second; - - // Offset input - const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1]; - uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes; - - // Offset output - const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2]; - uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes; - - Iterator input(in, window); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); - const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); - - storeq(out_ptr, loadq(in_ptr)); - }, - input, output); -} -} // namespace - -NEDepthConcatenateKernel::NEDepthConcatenateKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) -{ -} - -BorderSize NEDepthConcatenateKernel::border_size() const -{ - return BorderSize(_top_bottom, _left_right); -} - -void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - - // The gaps between the two lowest dimensions of input and output need to be divisible by 2 - // Otherwise it is not clear how the padding should be added onto the input tensor - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); - - _func = nullptr; - _input = input; - _output = output; - _depth_offset = depth_offset; - _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; - _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - - switch(input->info()->data_type()) - { - case DataType::QS8: - _func = &depth_concat; - break; - case DataType::QS16: - case DataType::F16: - _func = &depth_concat; - break; - case DataType::F32: - _func = &depth_concat; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); - const unsigned int num_rows_read_per_iteration = 1; - - // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); - - AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEDepthConcatenateKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window); -} diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp new file mode 100644 index 0000000000..01b0f10f70 --- /dev/null +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +namespace +{ +// Overloads of 128-bit vector loads +uint8x16_t loadq(const uint8_t *ptr) +{ + return vld1q_u8(ptr); +} +uint16x8_t loadq(const uint16_t *ptr) +{ + return vld1q_u16(ptr); +} +uint32x4_t loadq(const uint32_t *ptr) +{ + return vld1q_u32(ptr); +} +// Overloads of 128-bit vector stores +void storeq(uint8_t *ptr, uint8x16_t val) +{ + return vst1q_u8(ptr, val); +} +void storeq(uint16_t *ptr, uint16x8_t val) +{ + return vst1q_u16(ptr, val); +} +void storeq(uint32_t *ptr, uint32x4_t val) +{ + return vst1q_u32(ptr, val); +} + +template +void depth_concat(const ITensor *in, ITensor *out, std::pair start_xy, int depth_offset, const Window &window) +{ + const int start_x = start_xy.first; + const int start_y = start_xy.second; + + // Offset input + const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1]; + uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes; + + // Offset output + const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2]; + uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes; + + Iterator input(in, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); + const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); + + storeq(out_ptr, loadq(in_ptr)); + }, + input, output); +} +} // namespace + +NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) +{ +} + +BorderSize NEDepthConcatenateLayerKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _func = nullptr; + _input = input; + _output = output; + _depth_offset = depth_offset; + _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &depth_concat; + break; + case DataType::QS16: + case DataType::F16: + _func = &depth_concat; + break; + case DataType::F32: + _func = &depth_concat; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); + + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEDepthConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window); +} diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp deleted file mode 100644 index d97a20be65..0000000000 --- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include - -using namespace arm_compute; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -NEDepthConvertKernel::NEDepthConvertKernel() - : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0) -{ -} - -void NEDepthConvertKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32); - - _input = input; - _output = input; - _policy = policy; - _shift = shift; - - if(output != nullptr) - { - // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - // Set output - _output = output; - } - - // Set initial fixed point position of input and output - _fixed_point_position_input = input->info()->fixed_point_position(); - _fixed_point_position_output = _output->info()->fixed_point_position(); - - // Set the fixed point position to the output tensor if needed - if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type())) - { - // If in-place set the fixed point position of the output tensor to be equal to shift - _fixed_point_position_output = (_input == _output) ? static_cast(_shift) : _fixed_point_position_output; - // Set fixed point position to output tensor - _output->info()->set_fixed_point_position(_fixed_point_position_output); - } - - ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type()))); - ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type()))); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 - && output->info()->data_type() != DataType::S32), - "Only data_types supported [in] U8 -> [out] U16, S16, S32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS8 -> [out] QS8, F32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), - "Only data_types supported [in] U16 -> [out] U8, U32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), - "Only data_types supported [in] S16 -> [out] U8, S32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS16 -> [out] QS16, F32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16), - "Only data_types supported [in] F32 -> [out] QS8, QS16"); - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - if(output != nullptr) - { - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - } - else - { - // In-place computation - update_window_and_padding(win, input_access); - } - ICPPKernel::configure(win); -} - -void NEDepthConvertKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(nullptr == _input); - ARM_COMPUTE_ERROR_ON(nullptr == _output); - ARM_COMPUTE_ERROR_ON(_input == _output); - - Iterator input(_input, window); - Iterator output(_output, window); - - bool in_place = (_input == _output); - - switch(_input->info()->data_type()) - { - case DataType::U8: - { - const int16x8_t b = vdupq_n_s16(_shift); - - switch(_output->info()->data_type()) - { - case DataType::S16: - { - /* Up-conversion U8 -> S16 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - - const int16x8x2_t texels = - { - { - vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), - vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) - } - }; - - vst1q_s16(reinterpret_cast(output.ptr()), texels.val[0]); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); - }, - input, output); - break; - } - case DataType::S32: - { - /* Up-conversion U8 -> S32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - - const int16x8x2_t texels = - { - { - vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), - vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) - } - }; - - vst1q_s32(reinterpret_cast(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(reinterpret_cast(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(reinterpret_cast(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(reinterpret_cast(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - }, - input, output); - break; - } - case DataType::U16: - { - /* Up-conversion U8 -> U16 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - - const uint16x8x2_t texels = - { - { - vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b), - vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b) - } - }; - - vst1q_u16(reinterpret_cast(output.ptr()), texels.val[0]); - vst1q_u16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::QS8: - { - switch(_output->info()->data_type()) - { - case DataType::QS8: - { - const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; - /* Fixed point position conversion QS8 -> QS8 */ - if(relative_shift != 0 || !in_place) - { - const auto relative_shift_vec = vdupq_n_qs8(relative_shift); - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); - vst1q_qs8(reinterpret_cast(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec)); - }, - input, output); - } - break; - } - case DataType::F32: - { - /* Up-conversion QS8 -> F32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); - - float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input); - float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input); - - vst1q_f32(reinterpret_cast(output.ptr()), texels_low.val[0]); - vst1q_f32(reinterpret_cast(output.ptr()) + 4, texels_low.val[1]); - vst1q_f32(reinterpret_cast(output.ptr()) + 8, texels_high.val[0]); - vst1q_f32(reinterpret_cast(output.ptr()) + 12, texels_high.val[1]); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::S16: - { - switch(_output->info()->data_type()) - { - case DataType::U8: - { - const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); - - /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t texels = - { - { - vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), - vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) - } - }; - - vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - }, - input, output); - } - else - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t texels = - { - { - vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), - vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) - } - }; - - vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - }, - input, output); - } - break; - } - case DataType::S32: - { - const int32x4_t b = vdupq_n_s32(_shift); - - /* Up-conversion S16 -> S32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t texels = - { - { - vld1q_s16(reinterpret_cast(input.ptr())), - vld1q_s16(reinterpret_cast(input.ptr()) + 8) - } - }; - - const int32x4x4_t texels_s32 = - { - { - vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b), - vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b), - vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b), - vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b) - } - }; - - vst1q_s32(reinterpret_cast(output.ptr()), texels_s32.val[0]); - vst1q_s32(reinterpret_cast(output.ptr()) + 4, texels_s32.val[1]); - vst1q_s32(reinterpret_cast(output.ptr()) + 8, texels_s32.val[2]); - vst1q_s32(reinterpret_cast(output.ptr()) + 12, texels_s32.val[3]); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::U16: - { - switch(_output->info()->data_type()) - { - case DataType::U8: - { - const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); - - /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const uint16x8x2_t texels = - { - { - vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), - vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) - } - }; - - vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - }, - input, output); - } - else - { - execute_window_loop(window, [&](const Coordinates & id) - { - const uint16x8x2_t texels = - { - { - vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), - vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) - } - }; - - vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - }, - input, output); - } - break; - } - case DataType::U32: - { - const int32x4_t b = vdupq_n_s32(_shift); - - /* Up-conversion U16 -> U32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(reinterpret_cast(input.ptr())), - vld1q_u16(reinterpret_cast(input.ptr()) + 8) - } - }; - - vst1q_u32(reinterpret_cast(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b)); - vst1q_u32(reinterpret_cast(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b)); - vst1q_u32(reinterpret_cast(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b)); - vst1q_u32(reinterpret_cast(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b)); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::QS16: - { - switch(_output->info()->data_type()) - { - case DataType::QS16: - { - const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; - /* Fixed point position conversion QS16 -> QS16 */ - if(relative_shift != 0 || !in_place) - { - const auto relative_shift_vec = vdupq_n_qs16(relative_shift); - execute_window_loop(window, [&](const Coordinates & id) - { - const qint16x8x2_t texels_qs16 = - { - { - vld1q_qs16(reinterpret_cast(input.ptr())), - vld1q_qs16(reinterpret_cast(input.ptr()) + 8) - } - }; - vst1q_qs16(reinterpret_cast(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec)); - vst1q_qs16(reinterpret_cast(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec)); - }, - input, output); - } - break; - } - case DataType::F32: - { - /* Up-conversion QS16 -> F32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t texels_qs16 = - { - { - vld1q_s16(reinterpret_cast(input.ptr())), - vld1q_s16(reinterpret_cast(input.ptr()) + 8) - } - }; - - vst1q_f32(reinterpret_cast(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input)); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::F32: - { - switch(_output->info()->data_type()) - { - case DataType::QS8: - { - /* Down-conversion F32 -> QS8 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x4_t texels_f32 = - { - { - vld1q_f32(reinterpret_cast(input.ptr())), - vld1q_f32(reinterpret_cast(input.ptr()) + 4), - vld1q_f32(reinterpret_cast(input.ptr()) + 8), - vld1q_f32(reinterpret_cast(input.ptr()) + 12) - } - }; - - const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output); - - vst1q_s8(reinterpret_cast(output.ptr()), texels_s8); - }, - input, output); - break; - } - case DataType::QS16: - { - /* Down-conversion F32 -> QS16 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x2_t texels_f32_1 = - { - { - vld1q_f32(reinterpret_cast(input.ptr())), - vld1q_f32(reinterpret_cast(input.ptr()) + 4), - } - }; - const float32x4x2_t texels_f32_2 = - { - { - vld1q_f32(reinterpret_cast(input.ptr()) + 8), - vld1q_f32(reinterpret_cast(input.ptr()) + 12) - } - }; - - vst1q_s16(reinterpret_cast(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output)); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output)); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } -} diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp new file mode 100644 index 0000000000..c29cb57513 --- /dev/null +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEDepthConvertLayerKernel::NEDepthConvertLayerKernel() + : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0) +{ +} + +void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32); + + _input = input; + _output = input; + _policy = policy; + _shift = shift; + + if(output != nullptr) + { + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + // Set output + _output = output; + } + + // Set initial fixed point position of input and output + _fixed_point_position_input = input->info()->fixed_point_position(); + _fixed_point_position_output = _output->info()->fixed_point_position(); + + // Set the fixed point position to the output tensor if needed + if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type())) + { + // If in-place set the fixed point position of the output tensor to be equal to shift + _fixed_point_position_output = (_input == _output) ? static_cast(_shift) : _fixed_point_position_output; + // Set fixed point position to output tensor + _output->info()->set_fixed_point_position(_fixed_point_position_output); + } + + ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type()))); + ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type()))); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S32), + "Only data_types supported [in] U8 -> [out] U16, S16, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32), + "Only data_types supported [in] QS8 -> [out] QS8, F32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), + "Only data_types supported [in] U16 -> [out] U8, U32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), + "Only data_types supported [in] S16 -> [out] U8, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32), + "Only data_types supported [in] QS16 -> [out] QS16, F32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16), + "Only data_types supported [in] F32 -> [out] QS8, QS16"); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + if(output != nullptr) + { + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + } + else + { + // In-place computation + update_window_and_padding(win, input_access); + } + ICPPKernel::configure(win); +} + +void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(nullptr == _input); + ARM_COMPUTE_ERROR_ON(nullptr == _output); + ARM_COMPUTE_ERROR_ON(_input == _output); + + Iterator input(_input, window); + Iterator output(_output, window); + + bool in_place = (_input == _output); + + switch(_input->info()->data_type()) + { + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(_shift); + + switch(_output->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const int16x8x2_t texels = + { + { + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), texels.val[0]); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); + }, + input, output); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const int16x8x2_t texels = + { + { + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) + } + }; + + vst1q_s32(reinterpret_cast(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + }, + input, output); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const uint16x8x2_t texels = + { + { + vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b), + vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b) + } + }; + + vst1q_u16(reinterpret_cast(output.ptr()), texels.val[0]); + vst1q_u16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::QS8: + { + switch(_output->info()->data_type()) + { + case DataType::QS8: + { + const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; + /* Fixed point position conversion QS8 -> QS8 */ + if(relative_shift != 0 || !in_place) + { + const auto relative_shift_vec = vdupq_n_qs8(relative_shift); + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); + vst1q_qs8(reinterpret_cast(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec)); + }, + input, output); + } + break; + } + case DataType::F32: + { + /* Up-conversion QS8 -> F32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); + + float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input); + float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input); + + vst1q_f32(reinterpret_cast(output.ptr()), texels_low.val[0]); + vst1q_f32(reinterpret_cast(output.ptr()) + 4, texels_low.val[1]); + vst1q_f32(reinterpret_cast(output.ptr()) + 8, texels_high.val[0]); + vst1q_f32(reinterpret_cast(output.ptr()) + 12, texels_high.val[1]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::S16: + { + switch(_output->info()->data_type()) + { + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); + + /* Down-conversion S16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), + vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + }, + input, output); + } + else + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), + vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + }, + input, output); + } + break; + } + case DataType::S32: + { + const int32x4_t b = vdupq_n_s32(_shift); + + /* Up-conversion S16 -> S32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vld1q_s16(reinterpret_cast(input.ptr())), + vld1q_s16(reinterpret_cast(input.ptr()) + 8) + } + }; + + const int32x4x4_t texels_s32 = + { + { + vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b), + vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b), + vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b), + vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b) + } + }; + + vst1q_s32(reinterpret_cast(output.ptr()), texels_s32.val[0]); + vst1q_s32(reinterpret_cast(output.ptr()) + 4, texels_s32.val[1]); + vst1q_s32(reinterpret_cast(output.ptr()) + 8, texels_s32.val[2]); + vst1q_s32(reinterpret_cast(output.ptr()) + 12, texels_s32.val[3]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::U16: + { + switch(_output->info()->data_type()) + { + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); + + /* Down-conversion U16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), + vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + }, + input, output); + } + else + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), + vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + }, + input, output); + } + break; + } + case DataType::U32: + { + const int32x4_t b = vdupq_n_s32(_shift); + + /* Up-conversion U16 -> U32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(reinterpret_cast(input.ptr())), + vld1q_u16(reinterpret_cast(input.ptr()) + 8) + } + }; + + vst1q_u32(reinterpret_cast(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::QS16: + { + switch(_output->info()->data_type()) + { + case DataType::QS16: + { + const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; + /* Fixed point position conversion QS16 -> QS16 */ + if(relative_shift != 0 || !in_place) + { + const auto relative_shift_vec = vdupq_n_qs16(relative_shift); + execute_window_loop(window, [&](const Coordinates & id) + { + const qint16x8x2_t texels_qs16 = + { + { + vld1q_qs16(reinterpret_cast(input.ptr())), + vld1q_qs16(reinterpret_cast(input.ptr()) + 8) + } + }; + vst1q_qs16(reinterpret_cast(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec)); + vst1q_qs16(reinterpret_cast(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec)); + }, + input, output); + } + break; + } + case DataType::F32: + { + /* Up-conversion QS16 -> F32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels_qs16 = + { + { + vld1q_s16(reinterpret_cast(input.ptr())), + vld1q_s16(reinterpret_cast(input.ptr()) + 8) + } + }; + + vst1q_f32(reinterpret_cast(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input)); + vst1q_f32(reinterpret_cast(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input)); + vst1q_f32(reinterpret_cast(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input)); + vst1q_f32(reinterpret_cast(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::F32: + { + switch(_output->info()->data_type()) + { + case DataType::QS8: + { + /* Down-conversion F32 -> QS8 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x4_t texels_f32 = + { + { + vld1q_f32(reinterpret_cast(input.ptr())), + vld1q_f32(reinterpret_cast(input.ptr()) + 4), + vld1q_f32(reinterpret_cast(input.ptr()) + 8), + vld1q_f32(reinterpret_cast(input.ptr()) + 12) + } + }; + + const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output); + + vst1q_s8(reinterpret_cast(output.ptr()), texels_s8); + }, + input, output); + break; + } + case DataType::QS16: + { + /* Down-conversion F32 -> QS16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x2_t texels_f32_1 = + { + { + vld1q_f32(reinterpret_cast(input.ptr())), + vld1q_f32(reinterpret_cast(input.ptr()) + 4), + } + }; + const float32x4x2_t texels_f32_2 = + { + { + vld1q_f32(reinterpret_cast(input.ptr()) + 8), + vld1q_f32(reinterpret_cast(input.ptr()) + 12) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output)); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} diff --git a/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp deleted file mode 100644 index 5c4bd34e05..0000000000 --- a/src/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolution3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -using namespace arm_compute; -using namespace arm_compute::detail; - -NEDepthwiseConvolution3x3Kernel::NEDepthwiseConvolution3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _conv_info() -{ -} - -BorderSize NEDepthwiseConvolution3x3Kernel::border_size() const -{ - return _border_size; -} - -void NEDepthwiseConvolution3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3); - - std::pair expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(), - weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(), - conv_info); - - ARM_COMPUTE_UNUSED(expected_output); - ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x()); - ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y()); - - _input = input; - _output = output; - _weights = weights; - _conv_info = conv_info; - const unsigned int conv_stride_x = conv_info.stride().first; - const unsigned int conv_pad_x = conv_info.pad().first; - const unsigned int conv_pad_y = conv_info.pad().second; - - ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3); - - const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x; - _border_size = BorderSize(conv_pad_y, conv_pad_x); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration)); - - AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input->info()->dimension(0) + _border_size.right, input->info()->dimension(1) + _border_size.bottom); - AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -template -class convolver_3x3 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) - { - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); - const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator in(input, window_in); - Iterator out(output, window_out); - Iterator w(weights, window_k); - - const uint8_t *weights_ptr = w.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y; - int ih = 0; - int oh = 0; - - const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z; - const auto ptr_weights_r0 = reinterpret_cast(ptr_weights_base); - const auto ptr_weights_r1 = reinterpret_cast(ptr_weights_base + kernel_stride_y); - const auto ptr_weights_r2 = reinterpret_cast(ptr_weights_base + kernel_stride_y * 2); - const auto vw_r0 = load_matrix_row(ptr_weights_r0); - const auto vw_r1 = load_matrix_row(ptr_weights_r1); - const auto vw_r2 = load_matrix_row(ptr_weights_r2); - - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_ptr + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_ptr + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(out.ptr() + oh * output_stride_y); - - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0); - store_results(p_out, vres); - } - } - }, - in, out); - } -}; - -void NEDepthwiseConvolution3x3Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(info); - - const unsigned int conv_stride_x = _conv_info.stride().first; - const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x; - - switch(conv_stride_x) - { - case 1: - convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; - case 2: - convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; - case 3: - convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp new file mode 100644 index 0000000000..02962e0492 --- /dev/null +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; +using namespace arm_compute::detail; + +NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel() + : _border_size(0), _input(), _output(), _weights(), _conv_info() +{ +} + +BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const +{ + return _border_size; +} + +void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3); + + std::pair expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(), + weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(), + conv_info); + + ARM_COMPUTE_UNUSED(expected_output); + ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x()); + ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y()); + + _input = input; + _output = output; + _weights = weights; + _conv_info = conv_info; + const unsigned int conv_stride_x = conv_info.stride().first; + const unsigned int conv_pad_x = conv_info.pad().first; + const unsigned int conv_pad_y = conv_info.pad().second; + + ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3); + + const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x; + _border_size = BorderSize(conv_pad_y, conv_pad_x); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration)); + + AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input->info()->dimension(0) + _border_size.right, input->info()->dimension(1) + _border_size.bottom); + AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +template +class convolver_3x3 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_written_per_iteration, + const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + { + const int input_stride_x = input->info()->strides_in_bytes().x(); + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator in(input, window_in); + Iterator out(output, window_out); + Iterator w(weights, window_k); + + const uint8_t *weights_ptr = w.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y; + int ih = 0; + int oh = 0; + + const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z; + const auto ptr_weights_r0 = reinterpret_cast(ptr_weights_base); + const auto ptr_weights_r1 = reinterpret_cast(ptr_weights_base + kernel_stride_y); + const auto ptr_weights_r2 = reinterpret_cast(ptr_weights_base + kernel_stride_y * 2); + const auto vw_r0 = load_matrix_row(ptr_weights_r0); + const auto vw_r1 = load_matrix_row(ptr_weights_r1); + const auto vw_r2 = load_matrix_row(ptr_weights_r2); + + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast(input_ptr + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast(input_ptr + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast(input_ptr + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast(out.ptr() + oh * output_stride_y); + + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0); + store_results(p_out, vres); + } + } + }, + in, out); + } +}; + +void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(info); + + const unsigned int conv_stride_x = _conv_info.stride().first; + const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x; + + switch(conv_stride_x) + { + case 1: + convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + break; + case 2: + convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + break; + case 3: + convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} diff --git a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp deleted file mode 100644 index 12c532afd5..0000000000 --- a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include -#include - -using namespace arm_compute; - -namespace -{ -void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) -{ - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - - Window in_slice = window.first_slice_window_1D(); - Window sum_slice = window_sum.first_slice_window_1D(); - - do - { - Iterator input_it(in, in_slice); - Iterator sum_it(sum, sum_slice); - Iterator output_it(out, in_slice); - - const float sum_value = *reinterpret_cast(sum_it.ptr()); - const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon))); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value)); - }, - input_it, output_it); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); -} -} // namespace - -NEL2NormalizeKernel::NEL2NormalizeKernel() - : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12) -{ -} - -void NEL2NormalizeKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); - ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions"); - ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0"); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); - unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration; - - _input = input; - _sum = sum; - _output = output; - _axis = axis; - _epsilon = epsilon; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, sum_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - - INEKernel::configure(win); -} - -void NEL2NormalizeKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - switch(_axis) - { - case 0: - l2_normalize_X(_input, _sum, _output, _epsilon, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported normalization axis"); - } -} diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp new file mode 100644 index 0000000000..3bf1d9400e --- /dev/null +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +namespace +{ +void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) +{ + Window window_sum(window); + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + + Window in_slice = window.first_slice_window_1D(); + Window sum_slice = window_sum.first_slice_window_1D(); + + do + { + Iterator input_it(in, in_slice); + Iterator sum_it(sum, sum_slice); + Iterator output_it(out, in_slice); + + const float sum_value = *reinterpret_cast(sum_it.ptr()); + const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon))); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value)); + }, + input_it, output_it); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); +} +} // namespace + +NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel() + : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12) +{ +} + +void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); + ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions"); + ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0"); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); + unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration; + + _input = input; + _sum = sum; + _output = output; + _axis = axis; + _epsilon = epsilon; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, sum_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_axis) + { + case 0: + l2_normalize_X(_input, _sum, _output, _epsilon, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported normalization axis"); + } +} diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp index 647f88f0e2..8f2bf23ce3 100644 --- a/src/graph/operations/CLSimpleOperations.cpp +++ b/src/graph/operations/CLSimpleOperations.cpp @@ -106,7 +106,7 @@ REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationT return std::move(batch_norm); } -/* DepthConvert Layer */ +/* DepthConvertLayer Layer */ REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::DepthConvertLayer) { ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1); @@ -121,7 +121,7 @@ REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::D const auto shift = ctx.parameter("shift"); // Create and configure function - auto depthconvert = arm_compute::support::cpp14::make_unique(); + auto depthconvert = arm_compute::support::cpp14::make_unique(); depthconvert->configure(in, out, conv_policy, shift); // Log info @@ -156,13 +156,13 @@ REGISTER_SIMPLE_OPERATION(CLDepthwiseConvolutionOperation, OPENCL, OperationType bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3; if(run_3x3_opt) { - auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); depwthwise_conv->configure(in, weights, biases, out, conv_info); func = std::move(depwthwise_conv); } else { - auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); depwthwise_conv->configure(in, weights, biases, out, conv_info); func = std::move(depwthwise_conv); } @@ -313,7 +313,7 @@ REGISTER_SIMPLE_OPERATION(CLL2NormalizeLayerOperation, OPENCL, OperationType::L2 const auto epsilon = ctx.parameter("epsilon"); // Create and configure function - auto l2_norm = arm_compute::support::cpp14::make_unique(); + auto l2_norm = arm_compute::support::cpp14::make_unique(); l2_norm->configure(in, out, axis, epsilon); // Log info diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp index f234341cec..bb99e8da4b 100644 --- a/src/graph/operations/NESimpleOperations.cpp +++ b/src/graph/operations/NESimpleOperations.cpp @@ -106,7 +106,7 @@ REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationTyp return std::move(batch_norm); } -/* DepthConvert Layer */ +/* DepthConvertLayer Layer */ REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::DepthConvertLayer) { ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1); @@ -121,7 +121,7 @@ REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::Dep const auto shift = ctx.parameter("shift"); // Create and configure function - auto depthconvert = arm_compute::support::cpp14::make_unique(); + auto depthconvert = arm_compute::support::cpp14::make_unique(); depthconvert->configure(in, out, conv_policy, shift); // Log info @@ -156,13 +156,13 @@ REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType:: bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3; if(run_3x3_opt) { - auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); depwthwise_conv->configure(in, weights, biases, out, conv_info); func = std::move(depwthwise_conv); } else { - auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); depwthwise_conv->configure(in, weights, biases, out, conv_info); func = std::move(depwthwise_conv); } @@ -313,7 +313,7 @@ REGISTER_SIMPLE_OPERATION(NEL2NormalizeLayerOperation, NEON, OperationType::L2No const auto epsilon = ctx.parameter("epsilon"); // Create and configure function - auto l2_norm = arm_compute::support::cpp14::make_unique(); + auto l2_norm = arm_compute::support::cpp14::make_unique(); l2_norm->configure(in, out, axis, epsilon); // Log info diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp deleted file mode 100644 index 89e44ca98e..0000000000 --- a/src/runtime/CL/functions/CLDepthConcatenate.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLDepthConcatenate::CLDepthConcatenate() // NOLINT - : _inputs_vector(), - _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void CLDepthConcatenate::configure(std::vector inputs_vector, ICLTensor *output) // NOLINT -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - - unsigned int depth_offset = 0; - - _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - - TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position()); - - for(unsigned int i = 0; i < _num_inputs; i++) - { - _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); - _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } -} - -void CLDepthConcatenate::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - for(unsigned i = 0; i < _num_inputs; i++) - { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_concat_kernels_vector[i], true); - } -} diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp new file mode 100644 index 0000000000..05b5d54cf7 --- /dev/null +++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT + : _inputs_vector(), + _concat_kernels_vector(), + _border_handlers_vector(), + _num_inputs(0) +{ +} + +void CLDepthConcatenateLayer::configure(std::vector inputs_vector, ICLTensor *output) // NOLINT +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + + unsigned int depth_offset = 0; + + _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + + TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position()); + + for(unsigned int i = 0; i < _num_inputs; i++) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void CLDepthConcatenateLayer::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + for(unsigned i = 0; i < _num_inputs; i++) + { + CLScheduler::get().enqueue(_border_handlers_vector[i], false); + CLScheduler::get().enqueue(_concat_kernels_vector[i], true); + } +} diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp deleted file mode 100644 index b64d05b8b1..0000000000 --- a/src/runtime/CL/functions/CLDepthConvert.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" - -#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; - -void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, policy, shift); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp new file mode 100644 index 0000000000..b448465909 --- /dev/null +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" + +#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, policy, shift); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp deleted file mode 100644 index 81149508dd..0000000000 --- a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3() - : _kernel(), _border_handler() -{ -} - -void CLDepthwiseConvolution3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - - _kernel.set_target(CLScheduler::get().target()); - _kernel.configure(input, weights, biases, output, conv_info); - - // Configure border handler - PixelValue &&zero_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type())) - { - zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); - } - _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value); -} - -void CLDepthwiseConvolution3x3::run() -{ - CLScheduler::get().enqueue(_border_handler); - CLScheduler::get().enqueue(_kernel); -} - -CLDepthwiseConvolution::CLDepthwiseConvolution() - : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(), - _v2mm_output() -{ -} - -void CLDepthwiseConvolution::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2)); - - const size_t weights_w = weights->info()->dimension(0); - const size_t weights_h = weights->info()->dimension(1); - const size_t weights_z = weights->info()->dimension(2); - - const bool has_bias = (biases != nullptr); - const GPUTarget gpu_target = CLScheduler::get().target(); - - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info); - - // Set up intermediate tensors - const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0); - const size_t conv_size = conv_w * conv_h; - - // Im2Col configuration - TensorShape shape_im2col = input->info()->tensor_shape(); - shape_im2col.set(0, patch_size); - shape_im2col.set(1, conv_size); - shape_im2col.set(2, weights_z); - const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position()); - _input_reshaped.allocator()->init(info_im2col); - _im2col_kernel.set_target(gpu_target); - _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias); - - // Weights reshape configuration - const TensorShape shape_weights_reshape(patch_size, weights_z); - const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position()); - _weights_reshaped.allocator()->init(info_weights_reshape); - _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases); - - // GEMV configuration - TensorShape shape_v2mm_out = input->info()->tensor_shape(); - shape_v2mm_out.set(0, conv_size * weights_z); - shape_v2mm_out.set(1, 1); - shape_v2mm_out.set(2, 1); - const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position()); - _v2mm_output.allocator()->init(info_v2mm_out); - _v2mm_kernel.set_target(gpu_target); - _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output); - _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h); - - BorderSize border_size = _v2mm_kernel.border_size(); - _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0)); - - border_size.bottom = 0; - _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0)); - - // Allocate intermediate tensors - _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); - _v2mm_output.allocator()->allocate(); -} - -void CLDepthwiseConvolution::run() -{ - CLScheduler::get().enqueue(_im2col_kernel); - - CLScheduler::get().enqueue(_weights_reshape_kernel); - - CLScheduler::get().enqueue(_v2mm_input_fill_border); - CLScheduler::get().enqueue(_v2mm_weights_fill_border); - CLScheduler::get().enqueue(_v2mm_kernel); - - CLScheduler::get().enqueue(_vector_to_tensor_kernel); -} diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp new file mode 100644 index 0000000000..02273fe08b --- /dev/null +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3() + : _kernel(), _border_handler() +{ +} + +void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + + _kernel.set_target(CLScheduler::get().target()); + _kernel.configure(input, weights, biases, output, conv_info); + + // Configure border handler + PixelValue &&zero_value(0.f); + if(is_data_type_quantized_asymmetric(input->info()->data_type())) + { + zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + } + _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value); +} + +void CLDepthwiseConvolutionLayer3x3::run() +{ + CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(_kernel); +} + +CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer() + : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(), + _v2mm_output() +{ +} + +void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2)); + + const size_t weights_w = weights->info()->dimension(0); + const size_t weights_h = weights->info()->dimension(1); + const size_t weights_z = weights->info()->dimension(2); + + const bool has_bias = (biases != nullptr); + const GPUTarget gpu_target = CLScheduler::get().target(); + + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info); + + // Set up intermediate tensors + const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0); + const size_t conv_size = conv_w * conv_h; + + // Im2Col configuration + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, patch_size); + shape_im2col.set(1, conv_size); + shape_im2col.set(2, weights_z); + const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position()); + _input_reshaped.allocator()->init(info_im2col); + _im2col_kernel.set_target(gpu_target); + _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias); + + // Weights reshape configuration + const TensorShape shape_weights_reshape(patch_size, weights_z); + const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position()); + _weights_reshaped.allocator()->init(info_weights_reshape); + _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases); + + // GEMV configuration + TensorShape shape_v2mm_out = input->info()->tensor_shape(); + shape_v2mm_out.set(0, conv_size * weights_z); + shape_v2mm_out.set(1, 1); + shape_v2mm_out.set(2, 1); + const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position()); + _v2mm_output.allocator()->init(info_v2mm_out); + _v2mm_kernel.set_target(gpu_target); + _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output); + _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h); + + BorderSize border_size = _v2mm_kernel.border_size(); + _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0)); + + border_size.bottom = 0; + _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0)); + + // Allocate intermediate tensors + _input_reshaped.allocator()->allocate(); + _weights_reshaped.allocator()->allocate(); + _v2mm_output.allocator()->allocate(); +} + +void CLDepthwiseConvolutionLayer::run() +{ + CLScheduler::get().enqueue(_im2col_kernel); + + CLScheduler::get().enqueue(_weights_reshape_kernel); + + CLScheduler::get().enqueue(_v2mm_input_fill_border); + CLScheduler::get().enqueue(_v2mm_weights_fill_border); + CLScheduler::get().enqueue(_v2mm_kernel); + + CLScheduler::get().enqueue(_vector_to_tensor_kernel); +} diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp deleted file mode 100644 index 99be8cae4c..0000000000 --- a/src/runtime/CL/functions/CLL2Normalize.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLL2Normalize.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLL2Normalize::CLL2Normalize(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() -{ -} - -void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon) -{ - // Manage intermediate buffers - _memory_group.manage(&_sumsq); - - // Configure kernels - _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); - - // Allocate intermediate tensor - _sumsq.allocator()->allocate(); -} - -void CLL2Normalize::run() -{ - _memory_group.acquire(); - - _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); - - _memory_group.release(); -} diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp new file mode 100644 index 0000000000..d1bb65f1c9 --- /dev/null +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() +{ +} + +void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon) +{ + // Manage intermediate buffers + _memory_group.manage(&_sumsq); + + // Configure kernels + _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE); + _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + + // Allocate intermediate tensor + _sumsq.allocator()->allocate(); +} + +void CLL2NormalizeLayer::run() +{ + _memory_group.acquire(); + + _reduce_func.run(); + CLScheduler::get().enqueue(_normalize_kernel, true); + + _memory_group.release(); +} diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index a395487103..7e5278f380 100644 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -29,7 +29,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" #include "support/ToolchainSupport.h" diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp deleted file mode 100755 index ed756cf261..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -GCDepthConcatenate::GCDepthConcatenate() //NOLINT - : _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void GCDepthConcatenate::configure(std::vector inputs_vector, IGCTensor *output) //NOLINT -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - - unsigned int depth_offset = 0; - - _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - - for(unsigned int i = 0; i < _num_inputs; i++) - { - _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); - _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } -} - -void GCDepthConcatenate::run() -{ - for(unsigned i = 0; i < _num_inputs; i++) - { - GCScheduler::get().enqueue(_border_handlers_vector[i], false); - GCScheduler::get().enqueue(_concat_kernels_vector[i], true); - } -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp new file mode 100755 index 0000000000..ee0b121695 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCDepthConcatenateLayer::GCDepthConcatenateLayer() //NOLINT + : _concat_kernels_vector(), + _border_handlers_vector(), + _num_inputs(0) +{ +} + +void GCDepthConcatenateLayer::configure(std::vector inputs_vector, IGCTensor *output) //NOLINT +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + + unsigned int depth_offset = 0; + + _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + + for(unsigned int i = 0; i < _num_inputs; i++) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void GCDepthConcatenateLayer::run() +{ + for(unsigned i = 0; i < _num_inputs; i++) + { + GCScheduler::get().enqueue(_border_handlers_vector[i], false); + GCScheduler::get().enqueue(_concat_kernels_vector[i], true); + } +} diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp deleted file mode 100644 index f8ad2abe61..0000000000 --- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEDepthConcatenate::NEDepthConcatenate() // NOLINT - : _inputs_vector(), - _concat_kernels_vector(), - _border_handlers_vector(), - _num_inputs(0) -{ -} - -void NEDepthConcatenate::configure(std::vector inputs_vector, ITensor *output) // NOLINT -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); - - TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position()); - - unsigned int depth_offset = 0; - for(unsigned int i = 0; i < _num_inputs; ++i) - { - _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); - _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - - depth_offset += inputs_vector.at(i)->info()->dimension(2); - } -} - -void NEDepthConcatenate::run() -{ - for(unsigned i = 0; i < _num_inputs; ++i) - { - NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX); - NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX); - } -} diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp new file mode 100644 index 0000000000..437c9417ce --- /dev/null +++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT + : _inputs_vector(), + _concat_kernels_vector(), + _border_handlers_vector(), + _num_inputs(0) +{ +} + +void NEDepthConcatenateLayer::configure(std::vector inputs_vector, ITensor *output) // NOLINT +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + + TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position()); + + unsigned int depth_offset = 0; + for(unsigned int i = 0; i < _num_inputs; ++i) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void NEDepthConcatenateLayer::run() +{ + for(unsigned i = 0; i < _num_inputs; ++i) + { + NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX); + NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX); + } +} diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp deleted file mode 100644 index 37857b6534..0000000000 --- a/src/runtime/NEON/functions/NEDepthConvert.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" - -#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; - -void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, policy, shift); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp new file mode 100644 index 0000000000..9a75404fcd --- /dev/null +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" + +#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, policy, shift); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp deleted file mode 100644 index e12bc07464..0000000000 --- a/src/runtime/NEON/functions/NEDepthwiseConvolution.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolution.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -NEDepthwiseConvolution3x3::NEDepthwiseConvolution3x3() - : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false) -{ -} - -void NEDepthwiseConvolution3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); - - // Call convolution kernel - _kernel.configure(input, weights, output, conv_info); - _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - if(biases != nullptr) - { - _bias_kernel.configure(output, biases); - _has_bias = true; - } -} - -void NEDepthwiseConvolution3x3::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimX); - NEScheduler::get().schedule(&_kernel, Window::DimX); - if(_has_bias) - { - NEScheduler::get().schedule(&_bias_kernel, Window::DimX); - } -} - -NEDepthwiseConvolution::NEDepthwiseConvolution() - : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output() -{ -} - -void NEDepthwiseConvolution::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2)); - - const size_t weights_w = weights->info()->dimension(0); - const size_t weights_h = weights->info()->dimension(1); - const size_t weights_z = weights->info()->dimension(2); - - bool has_bias = (biases != nullptr); - - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info); - - // Set up intermediate tensors - const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0); - const size_t conv_size = conv_w * conv_h; - - // Im2Col configuration - TensorShape shape_im2col = input->info()->tensor_shape(); - shape_im2col.set(0, patch_size); - shape_im2col.set(1, conv_size); - shape_im2col.set(2, weights_z); - const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position()); - _input_reshaped.allocator()->init(info_im2col); - _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias); - - // Weights reshape configuration - const TensorShape shape_weights_reshape(patch_size, weights_z); - const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position()); - _weights_reshaped.allocator()->init(info_weights_reshape); - _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases); - - // GEMV configuration - TensorShape shape_v2mm_out = input->info()->tensor_shape(); - shape_v2mm_out.set(0, conv_size * weights_z); - shape_v2mm_out.set(1, 1); - shape_v2mm_out.set(2, 1); - const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position()); - _v2mm_output.allocator()->init(info_v2mm_out); - _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output); - _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h); - - // Allocate intermediate tensors - _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); - _v2mm_output.allocator()->allocate(); -} - -void NEDepthwiseConvolution::run() -{ - NEScheduler::get().schedule(&_im2col_kernel, Window::DimX); - NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); - NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX); - NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX); -} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp new file mode 100644 index 0000000000..b890c6f5d5 --- /dev/null +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3() + : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false) +{ +} + +void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + // Call convolution kernel + _kernel.configure(input, weights, output, conv_info); + _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); + if(biases != nullptr) + { + _bias_kernel.configure(output, biases); + _has_bias = true; + } +} + +void NEDepthwiseConvolutionLayer3x3::run() +{ + NEScheduler::get().schedule(&_border_handler, Window::DimX); + NEScheduler::get().schedule(&_kernel, Window::DimX); + if(_has_bias) + { + NEScheduler::get().schedule(&_bias_kernel, Window::DimX); + } +} + +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer() + : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output() +{ +} + +void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2)); + + const size_t weights_w = weights->info()->dimension(0); + const size_t weights_h = weights->info()->dimension(1); + const size_t weights_z = weights->info()->dimension(2); + + bool has_bias = (biases != nullptr); + + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info); + + // Set up intermediate tensors + const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0); + const size_t conv_size = conv_w * conv_h; + + // Im2Col configuration + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, patch_size); + shape_im2col.set(1, conv_size); + shape_im2col.set(2, weights_z); + const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position()); + _input_reshaped.allocator()->init(info_im2col); + _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias); + + // Weights reshape configuration + const TensorShape shape_weights_reshape(patch_size, weights_z); + const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position()); + _weights_reshaped.allocator()->init(info_weights_reshape); + _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases); + + // GEMV configuration + TensorShape shape_v2mm_out = input->info()->tensor_shape(); + shape_v2mm_out.set(0, conv_size * weights_z); + shape_v2mm_out.set(1, 1); + shape_v2mm_out.set(2, 1); + const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position()); + _v2mm_output.allocator()->init(info_v2mm_out); + _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output); + _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h); + + // Allocate intermediate tensors + _input_reshaped.allocator()->allocate(); + _weights_reshaped.allocator()->allocate(); + _v2mm_output.allocator()->allocate(); +} + +void NEDepthwiseConvolutionLayer::run() +{ + NEScheduler::get().schedule(&_im2col_kernel, Window::DimX); + NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); + NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX); + NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX); +} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp deleted file mode 100644 index 349a781b0b..0000000000 --- a/src/runtime/NEON/functions/NEL2Normalize.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEL2Normalize::NEL2Normalize(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() -{ -} - -void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon) -{ - // Manage intermediate buffers - _memory_group.manage(&_sumsq); - - // Configure Kernels - _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); - - // Allocate intermediate tensors - _sumsq.allocator()->allocate(); -} - -void NEL2Normalize::run() -{ - _memory_group.acquire(); - - _reduce_func.run(); - NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); - - _memory_group.release(); -} diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp new file mode 100644 index 0000000000..fa62483146 --- /dev/null +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() +{ +} + +void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon) +{ + // Manage intermediate buffers + _memory_group.manage(&_sumsq); + + // Configure Kernels + _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE); + _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + + // Allocate intermediate tensors + _sumsq.allocator()->allocate(); +} + +void NEL2NormalizeLayer::run() +{ + _memory_group.acquire(); + + _reduce_func.run(); + NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); + + _memory_group.release(); +} diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp index a680f1f11d..0e149d4176 100644 --- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp +++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp @@ -28,7 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" #include "arm_compute/runtime/Tensor.h" -- cgit v1.2.1