diff options
Diffstat (limited to 'src/core/CL/kernels')
63 files changed, 8686 insertions, 0 deletions
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp new file mode 100644 index 0000000000..685b8e234e --- /dev/null +++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +/** Default constructor: all tensor pointers start as nullptr until configure() assigns them. */ +CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +/** Validate the tensors, build the "absdiff" OpenCL kernel and set up the execution window. + * + * The data types of the three tensors are passed to the kernel build as DATA_TYPE_IN1, + * DATA_TYPE_IN2 and DATA_TYPE_OUT defines. The window is computed from input1 in steps + * of 16 elements and the output valid region is the intersection of the valid regions + * of both inputs. + * + * @param[in] input1 First input tensor. Data types accepted by the check: U8, S16. + * @param[in] input2 Second input tensor. Data types accepted by the check: U8, S16. + * @param[out] output Output tensor. Data types accepted by the check: U8, S16; + * U8 is only accepted when both inputs are U8. + */ +void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "The output image can only be U8 if both input images are U8"); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), 
Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once for every 2D slice of the given window. + * + * For each slice input1, input2 and output are bound as 2D tensor arguments, in that + * order, starting at argument index 0. + * + * @param[in] window Region to execute; validated as a sub-window of ICLKernel::window(). + * @param[in,out] queue Command queue passed to enqueue(). + */ +void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp new file mode 100644 index 0000000000..6333f04e71 --- /dev/null +++ b/src/core/CL/kernels/CLAccumulateKernel.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +/** Create the "accumulate" OpenCL kernel and configure it through ICLSimple2DKernel. + * + * The parent configure is called with 16 elements processed per iteration. + * + * @param[in] input Source tensor. Data type accepted by the check: U8. + * @param accum Accumulation tensor, forwarded as the second tensor of + * ICLSimple2DKernel::configure. Data type accepted by the check: S16. + */ +void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate")); + + // Make sure _kernel is initialized before calling the parent's configure + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +/** Create the "accumulate_weighted" OpenCL kernel, bind alpha and configure it through ICLSimple2DKernel. + * + * alpha is set as a kernel argument at index 2 * num_arguments_per_2D_tensor(), i.e. right + * after the argument slots reserved for the two 2D tensors. + * + * @param[in] input Source tensor. Data type accepted by the check: U8. + * @param[in] alpha Weight; the check rejects values below 0 or above 1. + * @param accum Accumulation tensor, forwarded as the second tensor of + * ICLSimple2DKernel::configure. Data type accepted by the check: U8. + */ +void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted")); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, alpha); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +/** Create the "accumulate_squared" OpenCL kernel, bind shift and configure it through ICLSimple2DKernel. + * + * shift is set as a kernel argument at index 2 * num_arguments_per_2D_tensor(), i.e. right + * after the argument slots reserved for the two 2D tensors. + * + * @param[in] input Source tensor. Data type accepted by the check: U8. + * @param[in] shift Shift value; the check rejects values greater than 15. + * @param accum Accumulation tensor, forwarded as the second tensor of + * ICLSimple2DKernel::configure. Data type accepted by the check: S16. + */ +void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + 
ARM_COMPUTE_ERROR_ON(shift > 15); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_squared")); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, shift); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp new file mode 100644 index 0000000000..83bbe6a3be --- /dev/null +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +/** Create the "activation_layer" OpenCL kernel for the given activation and configure it through ICLSimple3DKernel. + * + * If output is still empty it is auto-initialised from the shape, data type and fixed point + * position of input; afterwards shape, data type and fixed point position of both tensors + * must match. The build options carry the activation function name, TYPE_FP or TYPE_INT, + * DATA_TYPE, and the A and B values taken from act_info. + * + * @param[in] input Source tensor. Data types accepted by the check: F16, F32. + * @param[out] output Destination tensor; must not be nullptr. + * @param[in] act_info Activation function together with its a() and b() values. + */ +void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + // Set build options + std::set<std::string> build_opts; + build_opts.insert(("-D" + string_from_activation_func(act_info.activation()))); + build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? 
std::string("TYPE_FP") : std::string("TYPE_INT")))); + build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.insert(("-DA=" + val_to_string(act_info.a()))); + build_opts.insert(("-DB=" + val_to_string(act_info.b()))); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts)); + + // Make sure _kernel is initialized before calling the parent's configure + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp new file mode 100644 index 0000000000..aaa62d0268 --- /dev/null +++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> +#include <set> +#include <string> + +using namespace arm_compute; + +/** Default constructor: all tensor pointers start as nullptr until configure() assigns them. */ +CLArithmeticAdditionKernel::CLArithmeticAdditionKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +/** Validate the tensors, build the "arithmetic_add" OpenCL kernel and set up the execution window. + * + * The kernel is built with the WRAP define when policy is ConvertPolicy::WRAP or the output + * is a floating point type, and with the SATURATE define otherwise. A U8 output combined + * with a non-U8 input raises ARM_COMPUTE_ERROR. The window is computed from input1 in + * steps of 16 elements and the output valid region is the intersection of the valid + * regions of both inputs. + * + * @param[in] input1 First input tensor. Data types accepted by the check: U8, S16, F16, F32. + * @param[in] input2 Second input tensor. Data types accepted by the check: U8, S16, F16, F32. + * @param[out] output Output tensor. Data types accepted by the check: U8, S16, F16, F32. + * @param[in] policy Conversion policy, mapped to the WRAP or SATURATE build option. + */ +void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + + _input1 = input1; + _input2 = input2; + _output = output; + + const bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Check for invalid combination + if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8)) + { + ARM_COMPUTE_ERROR("You called with the wrong data types."); + } + + // Set kernel build options + std::set<std::string> build_opts; + 
build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once for every 2D slice of the given window. + * + * For each slice input1, input2 and output are bound as 2D tensor arguments, in that + * order, starting at argument index 0. + * + * @param[in] window Region to execute; validated as a sub-window of ICLKernel::window(). + * @param[in,out] queue Command queue passed to enqueue(). + */ +void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp new file mode 100644 index 0000000000..4c847276da --- /dev/null 
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +/** Default constructor: all tensor pointers start as nullptr until configure() assigns them. */ +CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +/** Validate the tensors, build the "arithmetic_sub" OpenCL kernel and set up the execution window. + * + * When the output is U8 both inputs are checked to be U8; otherwise the inputs are checked + * against U8, S16, F16 and F32. The kernel is built with the WRAP define when policy is + * ConvertPolicy::WRAP or the output is a floating point type, and with the SATURATE define + * otherwise. The window is computed from input1 in steps of 16 elements and the output + * valid region is the intersection of the valid regions of both inputs. + * + * @param[in] input1 First input tensor. + * @param[in] input2 Second input tensor. + * @param[out] output Output tensor. Data types accepted by the check: U8, S16, F16, F32. + * @param[in] policy Conversion policy, mapped to the WRAP or SATURATE build option. + */ +void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + // Check for invalid combination + if(output->info()->data_type() == DataType::U8) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + } + else + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + } + + _input1 = input1; + _input2 = input2; + _output = output; + + bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? 
"-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once for every 2D slice of the given window. + * + * For each slice input1, input2 and output are bound as 2D tensor arguments, in that + * order, starting at argument index 0. + * + * @param[in] window Region to execute; validated as a sub-window of ICLKernel::window(). + * @param[in,out] queue Command queue passed to enqueue(). + */ +void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..309a153b7a --- /dev/null +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp 
@@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +/** Default constructor: all tensor pointers start as nullptr and epsilon as 0. */ +CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() + : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0) +{ +} + +/** Validate the tensors, build the "batchnormalization_layer" OpenCL kernel and set up the execution window. + * + * All six tensors are checked to be F32. mean, var, beta and gamma must share one shape, + * input and output must share one shape, and dimension 2 of input must equal dimension 0 + * of mean. epsilon is bound as a cl_float kernel argument after the slots of the two 3D + * tensors and the four 1D tensors. The window is computed from input in steps of 4 + * elements and the output valid region is copied from input. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] mean Tensor bound in run() as the first 1D tensor argument. + * @param[in] var Tensor bound in run() as the second 1D tensor argument. + * @param[in] beta Tensor bound in run() as the third 1D tensor argument. + * @param[in] gamma Tensor bound in run() as the fourth 1D tensor argument. + * @param[in] epsilon Value stored and forwarded unchanged to the kernel. + */ +void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0)); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + _input = input; + _output = output; + _mean = mean; + _var = var; + _beta = beta; + _gamma = gamma; + _epsilon = epsilon; + + // Create kernel + std::string kernel_name = "batchnormalization_layer"; + _kernel = 
static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set kernel static arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input, output, mean, var, beta and gamma parameters + _kernel.setArg<cl_float>(idx++, _epsilon); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once for every 3D slice of the given window. + * + * mean, var, beta and gamma are bound once, before the loop, as 1D tensor arguments placed + * after the slots of the two 3D tensors; input and output are re-bound for every slice. + * + * @param[in] window Region to execute; validated as a sub-window of IKernel::window(). + * @param[in,out] queue Command queue passed to enqueue(). + */ +void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + Window vector_slice = window.first_slice_window_1D(); + vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + add_1D_tensor_argument(idx, _mean, vector_slice); + add_1D_tensor_argument(idx, _var, vector_slice); + add_1D_tensor_argument(idx, _beta, vector_slice); + add_1D_tensor_argument(idx, _gamma, vector_slice); + + do + { + idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp new file mode 100644 index 0000000000..5ea4a86da5 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +/** Default constructor: all tensor pointers start as nullptr until configure() assigns them. */ +CLBitwiseAndKernel::CLBitwiseAndKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} +/** Validate the tensors, create the "bitwise_and" OpenCL kernel and set up the execution window. + * + * The window is computed from input1 in steps of 16 elements and the output valid region + * is the intersection of the valid regions of both inputs. + * + * @param[in] input1 First input tensor. Data type accepted by the check: U8. + * @param[in] input2 Second input tensor. Data type accepted by the check: U8. + * @param[out] output Output tensor. Data type accepted by the check: U8. + */ +void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once for every 2D slice of the given window. + * + * For each slice input1, input2 and output are bound as 2D tensor arguments, in that + * order, starting at argument index 0. + * + * @param[in] window Region to execute; validated as a sub-window of ICLKernel::window(). + * @param[in,out] queue Command queue passed to enqueue(). + */ +void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = 
window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp new file mode 100644 index 0000000000..0098e15ab6 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +/** Store the tensors, create the "bitwise_not" OpenCL kernel and configure it through ICLSimple2DKernel. + * + * The parent configure is called with 16 elements processed per iteration. + * NOTE(review): _input and _output are assigned here before the parent configure runs; + * whether ICLSimple2DKernel::configure assigns them again is not visible here, confirm. + * + * @param[in] input Source tensor. Data type accepted by the check: U8. + * @param[out] output Destination tensor. Data type accepted by the check: U8. + */ +void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp new file mode 100644 index 0000000000..2eeef0a993 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBitwiseOrKernel::CLBitwiseOrKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + 
update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp new file mode 100644 index 0000000000..c19a78e1c4 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBitwiseXorKernel::CLBitwiseXorKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, 
valid_region); + + ICLKernel::configure(win); +} + +void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp new file mode 100644 index 0000000000..e113d30210 --- /dev/null +++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLBox3x3Kernel::border_size() const +{ + return 1; +} + +void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Set build options + std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1", + "-DMAT3=1", "-DMAT4=1", "-DMAT5=1", + "-DMAT6=1", "-DMAT7=1", "-DMAT8=1", + "-DSCALE=9", "-DDATA_TYPE_OUT=uchar" + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git 
a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp new file mode 100644 index 0000000000..5d06d34631 --- /dev/null +++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +CLGradientKernel::CLGradientKernel() + : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) +{ +} + +void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()), + "Gx and Gy must have the same pixel size"); + ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()), + "Mag must have the same pixel size as Gx and Gy"); + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + // Create build opts + std::set<std::string> built_opts; + built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type())); + built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type())); + + // Create kernel + const std::string kernel_name = (norm_type == 1) ? 
std::string("combine_gradients_L1") : std::string("combine_gradients_L2"); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, built_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access); + + mag_access.set_valid_region(win, _gx->info()->valid_region()); + phase_access.set_valid_region(win, _gx->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _gx, slice); + add_2D_tensor_argument(idx, _gy, slice); + add_2D_tensor_argument(idx, _magnitude, slice); + add_2D_tensor_argument(idx, _phase, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel() + : _magnitude(nullptr), _phase(nullptr), _output(nullptr) +{ +} + +BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const +{ + return BorderSize(1); +} + +void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, 
DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32); + + _magnitude = magnitude; + _phase = phase; + _output = output; + + // Create build opts + std::set<std::string> built_opts; + built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type())); + built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", built_opts)); + + // Set minimum threshold argument + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, lower_thr); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_written_per_iteration = 3; + + Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, + num_elems_read_written_per_iteration, num_elems_read_written_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, mag_access, phase_access, output_access); + + output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + 
add_2D_tensor_argument(idx, _magnitude, slice); + add_2D_tensor_argument(idx, _phase, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLEdgeTraceKernel::CLEdgeTraceKernel() + : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr) +{ +} + +void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, + ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8); + + _input = input; + _output = output; + _lower_thr = lower_thr; + _upper_thr = upper_thr; + _visited = visited; + _recorded = recorded; + _l1_stack = l1_stack; + _l1_stack_counter = l1_stack_counter; + + // Create build opts + std::set<std::string> built_opts; + built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", built_opts)); + + // Set constant kernel args + unsigned int width = _input->info()->dimension(0); + unsigned int height = _input->info()->dimension(1); + unsigned int idx = 6 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr)); + 
_kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr)); + _kernel.setArg(idx++, static_cast<cl_uint>(width)); + _kernel.setArg(idx++, static_cast<cl_uint>(height)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration), + output_access, + visited_access, + recorded_access, + l1_stack_access, + l1_stack_counter_access); + + output_access.set_valid_region(win, _input->info()->valid_region()); + visited_access.set_valid_region(win, _input->info()->valid_region()); + recorded_access.set_valid_region(win, _input->info()->valid_region()); + l1_stack_access.set_valid_region(win, _input->info()->valid_region()); + l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + add_2D_tensor_argument(idx, _visited, slice); + add_2D_tensor_argument(idx, _recorded, slice); + add_2D_tensor_argument(idx, 
_l1_stack, slice); + add_2D_tensor_argument(idx, _l1_stack_counter, slice); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp new file mode 100644 index 0000000000..d729ebcfb3 --- /dev/null +++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLChannelCombineKernel::CLChannelCombineKernel() + : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } } +{ +} + +void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); + + const Format fmt = output->info()->format(); + _planes[0] = plane0; + _planes[1] = plane1; + _planes[2] = plane2; + if(Format::RGBA8888 == fmt) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8); + _planes[3] = plane3; + } + else + { + _planes[3] = nullptr; + } + _output = output; + _output_multi = nullptr; + + // Half the processed elements for U,V channels due to sub-sampling of 2 + if(Format::YUYV422 == fmt || Format::UYVY422 == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + } + else + { + _x_subsampling = { { 1, 1, 1 } }; + _y_subsampling = { { 1, 1, 1 } }; + } + + // Create kernel + std::string kernel_name = "channel_combine_" + string_from_format(fmt); + 
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(), + plane1->info()->valid_region(), + plane2->info()->valid_region()); + if(plane3 != nullptr) + { + valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region); + } + output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + + _planes[0] = plane0; + 
_planes[1] = plane1; + _planes[2] = plane2; + _planes[3] = nullptr; + _output = nullptr; + _output_multi = output; + bool has_two_planars = false; + + // Set sub-sampling parameters for each plane + const Format fmt = output->info()->format(); + std::string kernel_name; + std::set<std::string> build_opts; + + if(Format::NV12 == fmt || Format::NV21 == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + kernel_name = "channel_combine_NV"; + build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21"); + has_two_planars = true; + } + else + { + if(Format::IYUV == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + } + else + { + _x_subsampling = { { 1, 1, 1 } }; + _y_subsampling = { { 1, 1, 1 } }; + } + + kernel_name = "copy_planes_3p"; + build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle output_plane2_access(has_two_planars ? 
nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + + update_window_and_padding(win, + input_plane0_access, input_plane1_access, input_plane2_access, + output_plane0_access, output_plane1_access, output_plane2_access); + + ValidRegion plane0_valid_region = plane0->info()->valid_region(); + ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region(); + output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + // Subsampling in plane 1 + Window win_sub_plane1(slice); + win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1])); + win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1)); + + // Subsampling in plane 2 + Window win_sub_plane2(slice); + win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2])); + win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / 
_y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1)); + + unsigned int idx = 0; + + // Set inputs + add_2D_tensor_argument(idx, _planes[0], slice); + add_2D_tensor_argument(idx, _planes[1], win_sub_plane1); + add_2D_tensor_argument(idx, _planes[2], win_sub_plane2); + + if(nullptr != _planes[3]) + { + add_2D_tensor_argument(idx, _planes[3], slice); + } + + // Set outputs + if(nullptr != _output) // Single planar output + { + add_2D_tensor_argument(idx, _output, slice); + } + else // Multi-planar output + { + // Reduce slice in case of subsampling to avoid out-of bounds access + slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1)); + + add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice); + add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1); + + if(3 == num_planes_from_format(_output_multi->info()->format())) + { + add_2D_tensor_argument(idx, _output_multi->cl_plane(2), win_sub_plane2); + } + + _kernel.setArg(idx++, slice.y().end()); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp new file mode 100644 index 0000000000..541153316a --- /dev/null +++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLChannelExtractKernel::CLChannelExtractKernel() + : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1) +{ +} + +void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output)); + + _input = input; + _output = output; + + // Check format + const Format format = input->info()->format(); + ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel); + + // Create kernel + std::string kernel_name = "channel_extract_" + string_from_format(format); + std::set<std::string> build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Half the processed elements for U,V channels due to sub-sampling of 2 + _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 
2 : 1; + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration); + AccessWindowRectangle output_access(input->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling); + + update_window_and_padding(win, input_access, output_access); + + ValidRegion input_valid_region = input->info()->valid_region(); + output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output)); + + // Get format + const Format fmt = input->info()->format(); + + // Get input plane + const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel)); + ARM_COMPUTE_ERROR_ON(nullptr == input_plane); + + _output = output; + _input = input_plane; + _subsampling = 1; + + // Create kernel + std::string kernel_name; + std::set<std::string> build_opts; + if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt) + { + kernel_name = "copy_plane"; + } + else + { + kernel_name = "channel_extract_" + string_from_format(fmt); + build_opts.insert(("-DCHANNEL_" + string_from_channel(channel))); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(input_plane->info(), 0, 
_num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input_plane->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + Window win_sub(slice); + win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling)); + win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1)); + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, win_sub); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp new file mode 100644 index 0000000000..ad66c39483 --- /dev/null +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <cmath> + +using namespace arm_compute; + +CLCol2ImKernel::CLCol2ImKernel() + : _input(nullptr), _output(nullptr), _convolved_dims() +{ +} + +void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + _convolved_dims = convolved_dims; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + _kernel.setArg<cl_uint>(idx++, _convolved_dims.first); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps()); + // The CLCol2ImKernel doesn't need 
padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + ICLKernel::configure(win); +} + +void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_2D(); + Window slice_out = window.first_slice_window_3D(); + do + { + // Set inputs + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } + while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out)); +} diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp new file mode 100644 index 0000000000..ead2b8f092 --- /dev/null +++ b/src/core/CL/kernels/CLColorConvertKernel.cpp @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <sstream> + +using namespace arm_compute; + +CLColorConvertKernel::CLColorConvertKernel() + : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr) +{ +} + +void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + switch(input->info()->format()) + { + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::RGB888: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + case Format::UYVY422: + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + num_elems_processed_per_iteration = 8; + break; + default: + break; + } + break; + } + case Format::RGB888: + { + switch(output->info()->format()) + { + case Format::RGBA8888: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to 
%s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->format()) + { + case Format::NV12: + case Format::NV21: + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + num_elems_processed_per_iteration = 4; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _multi_input = input; + _output = 
output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21); + const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling, sub_sampling); + AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling, sub_sampling); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + plane0_access, plane1_access, plane2_access, + output_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), + input->plane(2)->info()->valid_region()); + output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + + bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); + float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 
0.5f : 1; + + switch(input->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::NV12: + case Format::IYUV: + num_elems_processed_per_iteration = 2; + break; + case Format::YUV444: + num_elems_processed_per_iteration = 4; + break; + default: + break; + } + break; + } + case Format::UYVY422: + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::NV12: + case Format::IYUV: + num_elems_processed_per_iteration = 8; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _input = input; + _multi_output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444)) + { + win.set_dimension_step(Window::DimY, 2); + } + + AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + AccessWindowRectangle output_plane2_access(has_two_planes ? 
nullptr : output->plane(2)->info(), 0, 0, + num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_plane0_access, + output_plane1_access, + output_plane2_access); + + ValidRegion input_region = input->info()->valid_region(); + + output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output) +{ + unsigned int num_elems_processed_per_iteration = 0; + switch(input->info()->format()) + { + case Format::NV12: + case Format::NV21: + { + switch(output->info()->format()) + { + case Format::IYUV: + case Format::YUV444: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::YUV444: + case Format::NV12: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _multi_input = input; + _multi_output = output; + + // Create kernel + bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == 
Format::NV21); + bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); + + float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; + float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1; + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling_input, sub_sampling_input); + AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling_input, sub_sampling_input); + AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); + AccessWindowRectangle output_plane2_access(has_two_output_planars ? 
nullptr : output->plane(2)->info(), 0, 0, + num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); + + update_window_and_padding(win, + input_plane0_access, input_plane1_access, input_plane2_access, + output_plane0_access, output_plane1_access, output_plane2_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), + input->plane(2)->info()->valid_region()); + output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + if(nullptr != _input && nullptr != _output) + { + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _input && nullptr != _multi_output) + { + Format format = _multi_output->info()->format(); + do + { + Window win_uv(slice); + + if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) + { + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + } + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _multi_output->cl_plane(0), 
slice); + for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv); + } + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _multi_input && nullptr != _output) + { + Format format = _multi_input->info()->format(); + do + { + Window win_uv(slice); + + if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) + { + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); + + for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv); + } + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _multi_input && nullptr != _multi_output) + { + Format in_format = _multi_input->info()->format(); + Format out_format = _multi_output->info()->format(); + do + { + Window win_in_uv(slice); + if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format)) + { + win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2, + win_in_uv.x().end() / 2, win_in_uv.x().step() / 2)); + win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1)); + } + unsigned int idx = 0; + add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); + for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv); + } + + Window win_out_uv(slice); + if((Format::NV12 == out_format) || (Format::NV21 == 
out_format) || (Format::IYUV == out_format)) + { + win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2, + win_out_uv.x().end() / 2, win_out_uv.x().step() / 2)); + win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1)); + } + + add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice); + for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv); + } + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp new file mode 100644 index 0000000000..bdfe398a1d --- /dev/null +++ b/src/core/CL/kernels/CLConvolutionKernel.cpp @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +#define MAX_MATRIX_SIZE 81 + +/****************************************************************************************\ + * Square Convolution * +\****************************************************************************************/ + +template <unsigned int matrix_size> +BorderSize CLConvolutionKernel<matrix_size>::border_size() const +{ + return BorderSize(matrix_size / 2); +} + +template <unsigned int matrix_size> +void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(conv == nullptr); + + _input = input; + _output = output; + + std::stringstream kernel_name; + std::set<std::string> options; + kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static"; + + if(scale == 0) + { + scale = calculate_matrix_scale(conv, matrix_size); + } + + for(unsigned int i = 0; i < matrix_size * 
matrix_size; i++) + { + std::stringstream mat_str; + mat_str << "-DMAT" << i << "=" << conv[i]; + options.insert(mat_str.str()); + } + + options.insert("-DSCALE=" + val_to_string(scale)); + + DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size); + options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + + std::stringstream out_type; + out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + options.insert(out_type.str()); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = matrix_size; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/****************************************************************************************\ + * Separable Convolution * +\****************************************************************************************/ +template <unsigned int matrix_size> +CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel() + : _border_size(0) +{ +} + +template <unsigned int matrix_size> +BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const +{ + return _border_size; +} + +template 
<unsigned int matrix_size> +void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32); + + ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2); + + // Set build options + std::set<std::string> build_opts; + + int16_t mat[matrix_size * matrix_size] = { 0 }; + memcpy(mat, conv, matrix_size * sizeof(int16_t)); + + for(unsigned int j = 0; j < matrix_size * matrix_size; j++) + { + build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j])); + } + + build_opts.insert("-DSCALE=0"); + + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +template <unsigned int matrix_size> 
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const +{ + // Fixed border: matrix_size / 2 as first BorderSize argument, 0 as second + return BorderSize(matrix_size / 2, 0); +} + +/** Configure the vertical pass of the separable convolution. + * + * @param[in] input Source tensor; asserted to be 1-channel U16, S16 or S32. Its data type is passed to the kernel as DATA_TYPE. + * @param[out] output Destination tensor; asserted to be 1-channel U8 or S16. Its data type is passed to the kernel as DATA_TYPE_OUT. + * @param[in] conv Pointer to matrix_size coefficients; must not be null. + * @param[in] scale Value passed to the kernel as SCALE; must not be 0. + * @param[in] border_undefined Forwarded to the window and valid-region computation. + * @param[in] data_type Data type passed to the kernel as COMPUTE_TYPE. + */ +template <unsigned int matrix_size> +void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, + const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); + ARM_COMPUTE_ERROR_ON(scale == 0); + // conv is read by memcpy below, so reject a null pointer up front (same check as CLConvolutionRectangleKernel::configure) + ARM_COMPUTE_ERROR_ON(nullptr == conv); + + _input = input; + _output = output; + + std::set<std::string> build_opts; + + // The coefficients land in entries [matrix_size, 2 * matrix_size) of the array; every other entry stays 0 + int16_t mat[matrix_size * matrix_size] = { 0 }; + memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t)); + + for(unsigned int j = 0; j < matrix_size * matrix_size; j++) + { + build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j])); + } + + build_opts.insert("-DSCALE=" + val_to_string(scale)); + + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + + build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type)); + + std::stringstream out_type; + out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + build_opts.insert(out_type.str()); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = matrix_size; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, 
border_size()); + + // The input access starts border_size().top rows above the processed row and spans matrix_size rows + AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/****************************************************************************************\ + * Rectangle Convolution * +\****************************************************************************************/ + +/** Default constructor: zero border and no tensors until configure() is called. */ +CLConvolutionRectangleKernel::CLConvolutionRectangleKernel() + : _border_size(0), _input(nullptr), _output(nullptr) +{ +} + +/** Return the border currently stored in _border_size. */ +BorderSize CLConvolutionRectangleKernel::border_size() const +{ + return _border_size; +} + +/** Configure the rectangular convolution kernel. + * + * @param[in] input Source tensor; asserted to be 1-channel U8. + * @param[out] output Destination tensor; asserted to be 1-channel U8 or S16. + * @param[in] conv Pointer to width * height coefficients; must not be null. + * @param[in] width Matrix width; must be 3, 5, 7 or 9. + * @param[in] height Matrix height; must be 3, 5, 7 or 9. + * @param[in] scale Value passed to the kernel as SCALE; must not be 0. + * @param[in] border_undefined Forwarded to the window and valid-region computation. + */ +void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(nullptr == conv); + ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width); + ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height); + ARM_COMPUTE_ERROR_ON(0 == scale); + + _input = input; + _output = output; + _border_size = BorderSize(height / 2, width / 2); + + std::set<std::string> options; + + std::stringstream output_type; + output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + options.insert(output_type.str()); + + uint32_t matrix_size = width * height; + + // Zero-filled so that entries beyond width * height are still emitted as 0 + int16_t mat[MAX_MATRIX_SIZE] = { 0 }; + + memcpy(mat, conv, matrix_size * sizeof(int16_t)); + + // All MAX_MATRIX_SIZE entries are emitted as build options, not only the first width * height + for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++) + { + options.insert("-DMAT" + 
val_to_string(j) + "=" + val_to_string(mat[j])); + } + + options.insert("-DSCALE=" + val_to_string(scale)); + + DataType data_type = data_type_for_convolution_matrix(conv, matrix_size); + options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + + options.insert("-DMATRIX_WIDTH=" + val_to_string(width)); + options.insert("-DMATRIX_HEIGHT=" + val_to_string(height)); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + const unsigned int num_rows_read_per_iteration = height; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once per 2D slice of @p window, binding input and output as 2D tensor arguments. + * + * @param[in] window Region to execute; asserted to be a valid sub-window of the configured window. + * @param[in,out] queue Command queue the kernel is enqueued on. + */ +void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + // Argument index restarts at 0 for every slice: input first, then output + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +// Explicit template instantiations +template class arm_compute::CLConvolutionKernel<3>; +template class arm_compute::CLConvolutionKernel<5>; +template class arm_compute::CLConvolutionKernel<7>; +template 
class arm_compute::CLConvolutionKernel<9>; +template class arm_compute::CLSeparableConvolutionVertKernel<5>; +template class arm_compute::CLSeparableConvolutionVertKernel<7>; +template class arm_compute::CLSeparableConvolutionVertKernel<9>; +template class arm_compute::CLSeparableConvolutionHorKernel<5>; +template class arm_compute::CLSeparableConvolutionHorKernel<7>; +template class arm_compute::CLSeparableConvolutionHorKernel<9>; diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp new file mode 100644 index 0000000000..73f1ba15df --- /dev/null +++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +/** Default constructor: no tensors and zero border offsets until configure() is called. */ +CLDepthConcatenateKernel::CLDepthConcatenateKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) +{ +} + +/** Return the border built from the offsets computed in configure(). */ +BorderSize CLDepthConcatenateKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +/** Configure the kernel to place @p input inside @p output at a given depth offset. + * + * @param[in] input Source tensor; asserted to be 1-channel F32 and no larger than the output in dimensions 0 and 1. + * @param[in] depth_offset Offset along dimension 2 of the output; input depth + depth_offset must not exceed the output depth. + * @param[in,out] output Destination tensor; asserted to be 1-channel F32. + */ +void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth")); + + // Configure kernel window + // Half of the size difference in each of the two lowest dimensions becomes the border on each side + _left_right = (output->info()->dimension(0) - 
input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + // Byte offset into the output: depth_offset planes plus the left/top border, using the output strides + const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom * + output->info()->strides_in_bytes()[1]; + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = 4; + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size()); + + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + // The whole output shape is declared valid, starting at coordinates (0, 0) + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters + _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once per 2D slice of @p window, binding input and output as 2D tensor arguments. + * + * @param[in] window Region to execute; asserted to be a valid sub-window of the configured window. + * @param[in,out] queue Command queue the kernel is enqueued on. + */ +void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + // Argument index restarts at 0 for every slice; the byte offset set in configure() sits after these arguments + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp new file mode 100644 index 0000000000..24608bd17c --- /dev/null +++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp @@ -0,0 
+1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include <cstddef> +#include <set> +#include <string> + +using namespace arm_compute; + +/** Configure a depth (data type) conversion between two tensors. + * + * @param[in] input Source tensor; asserted to be 1-channel U8, S16, U16, U32 or S32. + * @param[out] output Destination tensor; same set of allowed types, but it must be a different object and a different data type than @p input. + * @param[in] policy Only used when the input element is larger than the output element: WRAP selects -DWRAP, anything else -DSATURATE. + * @param[in] shift Value set as the kernel argument that follows the tensor arguments; must be below 8. + */ +void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different"); + ARM_COMPUTE_ERROR_ON(shift >= 8); + + // Check if the conversion is supported + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16 + && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U8 -> [out] U16, S16, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types 
supported [in] S16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] U32 -> [out] U8, U16, S16"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] S32 -> [out] U8, U16, S16"); + + // Get data sizes + const size_t input_size = data_size_from_type(input->info()->data_type()); + const size_t output_size = data_size_from_type(output->info()->data_type()); + + // Construct kernel name and build options + // A larger input element selects the "_down" kernel (which needs a conversion policy); otherwise "_up" is used + std::string kernel_name = "convert_depth"; + std::set<std::string> build_opts; + if(input_size > output_size) + { + kernel_name += "_down"; + build_opts.insert((policy == ConvertPolicy::WRAP) ? 
"-DWRAP" : "-DSATURATE"); + } + else + { + kernel_name += "_up"; + } + build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set shift arg + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, shift); + + // Configure kernel + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp new file mode 100644 index 0000000000..36ba06d528 --- /dev/null +++ b/src/core/CL/kernels/CLDerivativeKernel.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +/** Default constructor: no tensors and neither derivative enabled until configure() is called. */ +CLDerivativeKernel::CLDerivativeKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false) +{ +} + +/** Return the fixed border of this kernel, BorderSize(1). */ +BorderSize CLDerivativeKernel::border_size() const +{ + return BorderSize(1); +} + +/** Configure the derivative kernel for the X output, the Y output, or both. + * + * @param[in] input Source tensor; asserted to be 1-channel U8. + * @param[out] output_x Optional X-derivative output; when non-null it is asserted to be 1-channel S16 and -DGRAD_X is defined. + * @param[out] output_y Optional Y-derivative output; when non-null it is asserted to be 1-channel S16 and -DGRAD_Y is defined. + * @param[in] border_undefined Forwarded to the window and valid-region computation. + * + * At least one of @p output_x and @p output_y must be non-null. + */ +void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_derivative_x = output_x != nullptr; + _run_derivative_y = output_y != nullptr; + + if(_run_derivative_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_derivative_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_derivative_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_derivative_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts)); + + // Configure kernel window + constexpr 
unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_read_rows_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + // Placeholder input access; it is replaced below according to which derivatives are enabled + AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration); + if(_run_derivative_x && _run_derivative_y) + { + // Both derivatives: the access is offset by the border horizontally and vertically and spans 3 rows + input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration); + } + else if(_run_derivative_x) + { + // X only: horizontal access offset by the left border + input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration); + } + else if(_run_derivative_y) + { + // Y only: no horizontal offset, access offset by the top border and spanning 3 rows + input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration); + } + + update_window_and_padding(win, + input_access, + output_x_access, + output_y_access); + + // NOTE(review): called for both outputs even when one was built from a null info; presumably the access window tolerates that - confirm + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once per 2D slice of @p window, binding the input and only the enabled outputs. + * + * @param[in] window Region to execute; asserted to be a valid sub-window of the configured window. + * @param[in,out] queue Command queue the kernel is enqueued on. + */ +void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_derivative_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_derivative_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + 
while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp new file mode 100644 index 0000000000..3abd747011 --- /dev/null +++ b/src/core/CL/kernels/CLDilateKernel.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDilateKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +/** Return the fixed border of this kernel, BorderSize(1). */ +BorderSize CLDilateKernel::border_size() const +{ + return BorderSize(1); +} + +/** Configure the "dilate" kernel. + * + * @param[in] input Source tensor; asserted to be 1-channel U8. + * @param[out] output Destination tensor; asserted to be 1-channel U8. + * @param[in] border_undefined Forwarded to the window and valid-region computation. + */ +void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate")); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + // The input access is offset by the border on the left and top: 16 elements over 3 rows per iteration + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp new file mode 100644 index 0000000000..a7aa88fc5c --- /dev/null +++ b/src/core/CL/kernels/CLErodeKernel.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLErodeKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +/** Return the fixed border of this kernel, BorderSize(1). */ +BorderSize CLErodeKernel::border_size() const +{ + return BorderSize(1); +} + +/** Configure the "erode" kernel. + * + * @param[in] input Source tensor; asserted to be 1-channel U8. + * @param[out] output Destination tensor; asserted to be 1-channel U8. + * @param[in] border_undefined Forwarded to the window and valid-region computation. + */ +void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode")); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + // The input access is offset by the border on the left and top: 16 elements over 3 rows per iteration + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp new file mode 100644 index 0000000000..1d4d776730 --- /dev/null +++ b/src/core/CL/kernels/CLFastCornersKernel.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +/** Default constructor: no tensors until configure() is called. */ +CLFastCornersKernel::CLFastCornersKernel() + : ICLKernel(), _input(nullptr), _output(nullptr) +{ +} + +/** Return the fixed border of this kernel, BorderSize(3). */ +BorderSize CLFastCornersKernel::border_size() const +{ + return BorderSize(3); +} + +/** Configure the "fast_corners" kernel. + * + * @param[in] input Source image; asserted to be 2D and 1-channel U8. + * @param[out] output Destination image; asserted to be 2D and 1-channel U8. + * @param[in] threshold Value set as the kernel argument that follows the tensor arguments. + * @param[in] non_max_suppression When true, -DUSE_MAXSUPPRESSION is added to the build options. + * @param[in] border_mode Only BorderMode::UNDEFINED is accepted; any other value raises "Not implemented". + */ +void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented"); + + _input = input; + _output = output; + + // Create build options + std::set<std::string> build_opts; + + if(non_max_suppression) + { + build_opts.emplace("-DUSE_MAXSUPPRESSION"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts)); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters + _kernel.setArg<cl_float>(idx, static_cast<float>(threshold)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 7; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // NOTE(review): the literal BorderSize(3) duplicates the value returned by border_size(), which is used everywhere below + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3)); 
+ + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size()); + + ICLKernel::configure(win); +} + +/** Enqueue the kernel once per 2D slice of @p window, binding input and output as 2D tensor arguments. + * + * @param[in] window Region to execute; asserted to be a valid sub-window of the configured window. + * @param[in,out] queue Command queue the kernel is enqueued on. + */ +void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +/** Default constructor: no image, key-point array or counter buffer until configure() is called. */ +CLCopyToArrayKernel::CLCopyToArrayKernel() + : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr) +{ +} + +/** Configure the "copy_to_keypoint" kernel. + * + * @param[in] input Source image; asserted to be 2D and 1-channel U8. + * @param[in] update_number When true, -DUPDATE_NUMBER is added to the build options. + * @param[out] corners Key-point array whose CL buffer and maximum size are passed to the kernel; must not be null. + * @param[in,out] num_buffers CL buffer passed to the kernel and reset to 0 at the start of run(); must not be null. + */ +void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(corners == nullptr); + ARM_COMPUTE_ERROR_ON(num_buffers == nullptr); + + _input = input; + _corners = corners; + _num_buffer = num_buffers; + + std::set<std::string> build_opts; + + if(update_number) + { + build_opts.emplace("-DUPDATE_NUMBER"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts)); + + //Get how many pixels skipped in the x dimension in the previous stages + unsigned int offset = _input->info()->valid_region().anchor.x(); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input tensor parameters (run() binds no output tensor for this kernel) + 
_kernel.setArg<unsigned int>(idx++, corners->max_num_values()); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg(idx++, *_num_buffer); + _kernel.setArg(idx++, _corners->cl_buffer()); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +/** Reset the counter buffer to 0, then enqueue the kernel once per 2D slice of @p window with the input as its only tensor argument. + * + * @param[in] window Region to execute; asserted to be a valid sub-window of the configured window. + * @param[in,out] queue Command queue used for the buffer write and the kernel. + */ +void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + //Initialise the _num_buffer as it used as both input and output + // The write is issued with CL_FALSE; the source value is static, so it is still alive after run() returns + static const unsigned int zero_init = 0; + queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp new file mode 100644 index 0000000000..981aad665a --- /dev/null +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <cstdint>
#include <set>
#include <sstream>
#include <string>

using namespace arm_compute;

/** Default constructor: no tensor bound and no OpenCL kernel created yet. */
CLFillBorderKernel::CLFillBorderKernel()
    : ICLKernel(), _tensor(nullptr)
{
}

/** This kernel always reports itself as not parallelisable.
 *
 * @return false.
 */
bool CLFillBorderKernel::is_parallelisable() const
{
    return false;
}

/** Read the constant border value as type T and add it as a kernel argument.
 *
 * @param[in] idx                   Index at which the argument is added.
 *                                  NOTE(review): idx is taken by value, so any advance made by
 *                                  add_argument() is presumably not visible to the caller; harmless
 *                                  today because this is the last static argument - confirm.
 * @param[in] constant_border_value Pixel value the constant is extracted from.
 */
template <class T>
void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
{
    T value;
    constant_border_value.get(value);
    ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}

/** Build the border-filling OpenCL kernel for @p tensor and compute its execution window.
 *
 * Returns early, leaving the kernel unconfigured, when the border is empty after being
 * limited to the tensor padding or when @p border_mode is BorderMode::UNDEFINED.
 *
 * @param[in,out] tensor                Tensor whose border is filled; must not be nullptr and must have one channel.
 * @param[in]     border_size           Requested border; limited to the tensor's padding before use.
 * @param[in]     border_mode           Selects the kernel ("fill_image_borders_" + lower-cased mode name).
 * @param[in]     constant_border_value Value passed to the kernel when @p border_mode is BorderMode::CONSTANT.
 */
void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);

    border_size.limit(tensor->info()->padding());

    // If there is no border: early exit
    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
    {
        return;
    }

    // Select appropriate kernel
    std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));

    // Define select type required by replicate border > 1
    // For floating point data the select type is an integer type instead of the data type itself:
    // "int" for F32, "short" for any other floating point type.
    const DataType dt          = tensor->info()->data_type();
    std::string    select_type = get_cl_type_from_data_type(dt);
    if(is_data_type_float(dt))
    {
        select_type = (DataType::F32 == dt) ? "int" : "short";
    }

    // Define build options
    std::set<std::string> build_opts;
    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
    build_opts.emplace(("-DSELECT_TYPE=" + select_type));
    build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
    build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
    build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
    build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));

    // Create kernel
    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
    _tensor = tensor;

    // Create static kernel arguments
    const unsigned int valid_width  = tensor->info()->valid_region().shape[0];
    const unsigned int valid_height = tensor->info()->valid_region().shape[1];
    const cl_int2 valid_region_coords =
    {
        {
            static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
            static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
        }
    };
    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;

    // Set static kernel arguments
    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
    ICLKernel::add_argument<cl_uint>(idx, valid_width);
    ICLKernel::add_argument<cl_uint>(idx, valid_height);
    ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
    if(BorderMode::CONSTANT == border_mode)
    {
        // The constant is passed with the C++ type that corresponds to the tensor data type.
        switch(dt)
        {
            case DataType::U8:
                set_constant_border<uint8_t>(idx, constant_border_value);
                break;
            case DataType::U16:
                set_constant_border<uint16_t>(idx, constant_border_value);
                break;
            case DataType::S16:
                set_constant_border<int16_t>(idx, constant_border_value);
                break;
            case DataType::U32:
                set_constant_border<uint32_t>(idx, constant_border_value);
                break;
            case DataType::S32:
                set_constant_border<int32_t>(idx, constant_border_value);
                break;
            case DataType::F32:
                static_assert(sizeof(float) == 4, "Float must be 32 bit");
                set_constant_border<float>(idx, constant_border_value);
                break;
            case DataType::F16:
                static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
                set_constant_border<cl_half>(idx, constant_border_value);
                break;
            default:
                ARM_COMPUTE_ERROR("Not handled");
        }
    }

    // Configure kernel window
    // X covers (left border + valid width + right border) + valid height positions, Y is a single row,
    // and Z follows the tensor dimensions.
    // NOTE(review): presumably one work-item per border column plus one per border row - confirm against the .cl kernel.
    Window win;
    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
    win.set(Window::DimY, Window::Dimension(0, 1, 1));
    win.use_tensor_dimensions(tensor->info(), Window::DimZ);
    ICLKernel::configure(win);
}

/** Enqueue the border-filling kernel once per 2D slice of @p window.
 *
 * Does nothing when configure() returned early without creating a kernel.
 *
 * @param[in]     window Region to execute; must match the configured window.
 * @param[in,out] queue  Command queue the kernel is enqueued on.
 */
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
    // Border mode undefined or border width == 0
    if(_kernel() == nullptr)
    {
        return;
    }

    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);

    Window slice = window.first_slice_window_2D();

    do
    {
        unsigned int idx = 0;
        add_2D_tensor_argument(idx, _tensor, slice);
        // cl::NullRange is passed as the local work size argument.
        enqueue(queue, *this, slice, cl::NullRange);
    }
    while(window.slide_window_slice_2D(slice));
}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..71d42c5606 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -0,0 +1,106 @@
/*
 * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f)); + + _input = input; + _output = output; + + // Create kernel + std::string data_type_name; + data_type_name = val_to_string(input->info()->element_size() * 8) + "bit"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + + Window win = 
calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ] + */ + Window in_slice = window.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + // Change x and y steps for the slide of output tensor + out_slice.scale(Window::DimX, 4.f); + out_slice.scale(Window::DimY, 0.25f); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); +} diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..c6e05b92a2 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> +#include <cstdint> +#include <tuple> + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, + int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + + _input0 = input0; + _input1 = input1; + _output = output; + + // Create kernel and set static arguments + std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts)); + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<int32_t>(idx++, a_offset); + _kernel.setArg<int32_t>(idx++, b_offset); + _kernel.setArg<int32_t>(idx++, output_offset); + _kernel.setArg<int32_t>(idx++, output_mult_int); + _kernel.setArg<int32_t>(idx++, shift); + + // Configure window + constexpr unsigned int 
num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + constexpr unsigned int num_elems_read_per_iteration_input0 = 4; + constexpr unsigned int num_elems_read_per_iteration_input1 = 16; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + Window slice_matrix_b = slice; + slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1)); + slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() < 3) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_2D_tensor_argument(idx, _input1, slice_b); + add_2D_tensor_argument(idx, _output, slice); + 
enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 0000000000..289873c23f --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

using namespace arm_compute;

/** Default constructor: no accumulator or biases tensor bound yet. */
CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
    : _accum(nullptr), _biases(nullptr)
{
}

/** Build the "gemm_accumulate_biases_<type>" OpenCL kernel and compute its execution window.
 *
 * @param[in,out] accum  Accumulator tensor; single channel, F16 or F32. It is bound as the first kernel argument.
 * @param[in]     biases Biases tensor; same data type as @p accum and exactly one dimension.
 */
void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);

    _biases = biases;
    _accum  = accum;

    // Create kernel: the kernel name suffix is the lower-cased data type name of the accumulator
    std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));

    // Configure kernel window
    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());

    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));

    // The biases access is a fixed rectangle covering the tensor's own dimensions 0 and 1,
    // while the accumulator is accessed horizontally, one vector per iteration.
    AccessWindowStatic     biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);

    update_window_and_padding(win, biases_access, accum_access);

    ICLKernel::configure(win);
}

/** Enqueue the kernel once per 2D slice of @p window.
 *
 * @param[in]     window Region to execute; must match the configured window.
 * @param[in,out] queue  Command queue the kernel is enqueued on.
 */
void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);

    Window accum_slice = window.first_slice_window_2D();

    // The biases slice copies the first accumulator slice with its Y dimension collapsed to [0, 1).
    // It is not advanced in the loop below, so every accumulator slice is paired with the same biases slice.
    Window biases_slice(accum_slice);
    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));

    // Run kernel
    do
    {
        // Set arguments
        unsigned int idx = 0;
        add_2D_tensor_argument(idx, _accum, accum_slice);
        add_1D_tensor_argument(idx, _biases, biases_slice);

        enqueue(queue, *this, accum_slice);
    }
    while(window.slide_window_slice_2D(accum_slice));
}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..343838f2f9 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp @@ -0,0 +1,92 @@
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + _input = input; + _output = output; + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + + std::ostringstream ma_arguments; + ma_arguments << "-DBETA=" << beta; + std::set<std::string> build_opts; + build_opts.emplace(ma_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + 
update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..d7388e8579 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/AccessWindowTranspose.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + if(output->info()->dimension(1) == 1) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + _input0 = input0; + _input1 = input1; + _output = output; + + if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } + + std::ostringstream mm_arguments; + mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " "; + 
mm_arguments << "-DALPHA=" << alpha << " "; + std::set<std::string> build_opts; + + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if(output->info()->dimension(1) == 1) + { + mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " "; + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts)); + + // Configure window kernel + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } + else + { + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + + if(data_type_name == "f32") + { + GPUTarget arch_target = get_arch_from_target(get_target()); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts)); + } + else + { + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts)); + } + + // Configure window kernel + const unsigned int num_elems_processed_per_iteration_x 
= max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } +} + +void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + Window slice_matrix_b = slice; + slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1)); + slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() < 3) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_2D_tensor_argument(idx, _input1, slice_b); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); + } + 
while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp new file mode 100644 index 0000000000..ecee1abd72 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> + +using namespace arm_compute; + +void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t transpose_w = 16 / input->info()->element_size(); + output_shape.set(0, input->info()->dimension(1) * transpose_w); + output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w))))); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + + /* + * Following an example of how the transposition1xW works when the input data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ] + * 
If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ] + */ + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type())); + std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + float scale_x = 1.f; + + switch(input->info()->data_type()) + { + case DataType::U8: + scale_x = 16.f; + break; + case DataType::F16: + scale_x = 8.f; + break; + case DataType::F32: + scale_x = 4.f; + break; + default: + // Do nothing + break; + } + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Output is transposed + Window out_window(window); + out_window.set(Window::DimX, window.y()); + out_window.set(Window::DimY, window.x()); + + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, _lws_hint); + } + while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); +} diff --git 
a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp new file mode 100644 index 0000000000..e5bc3f9656 --- /dev/null +++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLGaussian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Set build options + std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1", + "-DMAT3=2", "-DMAT4=4", "-DMAT5=2", + "-DMAT6=1", "-DMAT7=2", "-DMAT8=1", + "-DSCALE=16", "-DDATA_TYPE_OUT=uchar" + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp 
b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp new file mode 100644 index 0000000000..bd523c883d --- /dev/null +++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" + +#include <cstdint> + +using namespace arm_compute; + +void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + const int16_t matrix[] = { 1, 4, 6, 4, 1 }; + + // Set arguments + CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined); +} + +void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + const uint32_t scale = 256; + const int16_t matrix[] = { 1, 4, 6, 4, 1 }; + + // Set arguments + CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined); +} diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp new file mode 100644 index 0000000000..34a228c717 --- /dev/null +++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel() + : _border_size(0), _l2_load_offset(0) +{ +} + +BorderSize CLGaussianPyramidHorKernel::border_size() const +{ + return _border_size; +} + +void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 
0 : 2, 2); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_read_per_iteration = 20; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr float scale_x = 0.5f; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x); + + // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even + // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether + // a pixel is even or odd is determined based on the tensor shape not the + // valid region!) + // Thus the offset from which the first pixel (L2) for the convolution is + // loaded depends on the anchor and shape of the valid region. + // In the case of an even shape (= even image width) we need to load L2 + // from -2 if the anchor is odd and from -1 if the anchor is even. That + // makes sure that L2 is always loaded from an odd pixel. + // On the other hand, for an odd shape (= odd image width) we need to load + // L2 from -1 if the anchor is odd and from -2 if the anchor is even to + // achieve the opposite effect. + // The condition can be simplified to checking whether anchor + shape is + // odd (-2) or even (-1) as only adding an odd and an even number will have + // an odd result. 
+ _l2_load_offset = -border_size().left; + + if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0) + { + _l2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f)); + valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window win_in(window); + win_in.shift(Window::DimX, _l2_load_offset); + + //The output is half the width of the input: + Window win_out(window); + win_out.scale(Window::DimX, 0.5f); + + Window slice_in = win_in.first_slice_window_2D(); + Window slice_out = win_out.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); +} + +CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel() + : _t2_load_offset(0) +{ +} + +BorderSize CLGaussianPyramidVertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 
output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_rows_processed_per_iteration = 2; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_per_iteration = 5; + constexpr float scale_y = 0.5f; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), + border_undefined, border_size()); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y); + + // Determine whether we need to load even or odd rows. See above for a + // detailed explanation. + _t2_load_offset = -border_size().top; + + if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0) + { + _t2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f)); + valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? 
border_size().bottom : 0)) / 2 - valid_region.anchor[1]); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(window.x().step() != 8); + ARM_COMPUTE_ERROR_ON(window.y().step() % 2); + + Window win_in(window); + win_in.shift(Window::DimY, _t2_load_offset); + + Window win_out(window); + win_out.scale(Window::DimY, 0.5f); + + Window slice_in = win_in.first_slice_window_2D(); + Window slice_out = win_out.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp new file mode 100644 index 0000000000..87659c4ba9 --- /dev/null +++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel() + : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size() +{ +} + +void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(hog_info == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32); + ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX)); + ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY)); + + _input_magnitude = input_magnitude; + _input_phase = input_phase; + 
_output = output; + _cell_size = hog_info->cell_size(); + + float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f); + phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f); + + std::stringstream args_str; + args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " "; + args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " "; + args_str << "-DNUM_BINS=" << hog_info->num_bins() << " "; + args_str << "-DPHASE_SCALE=" << phase_scale << " "; + + // Construct kernel name + std::set<std::string> build_opts = {}; + build_opts.insert(args_str.str()); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts)); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height; + constexpr unsigned int num_elems_written_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + // Compute slice 
for the magnitude and phase tensors + Window slice_mag_phase = window.first_slice_window_2D(); + slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width)); + slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height)); + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase); + add_2D_tensor_argument(idx, _input_phase, slice_mag_phase); + add_2D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel() + : _input(nullptr), _output(nullptr), _num_cells_per_block_stride() +{ +} + +void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info) +{ + ARM_COMPUTE_ERROR_ON(hog_info == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); + + // Number of cells per block + const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width, + hog_info->block_size().height / hog_info->cell_size().height); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32); + + // Number of cells per block stride + const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width, + hog_info->block_stride().height / hog_info->cell_size().height); + + _input = input; + _output = output; + _num_cells_per_block_stride = num_cells_per_block_stride; + + std::stringstream args_str; + args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " "; + args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " "; + args_str << 
"-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " "; + args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " "; + args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " "; + args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " "; + args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " "; + args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " "; + + // Construct kernel name + std::set<std::string> build_opts = {}; + build_opts.insert(args_str.str()); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts)); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = num_cells_per_block.height; + constexpr unsigned int num_elems_written_per_iteration = 1; + const unsigned int num_rows_written_per_iteration = num_cells_per_block.height; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + // Compute slice for the magnitude and phase tensors + Window slice_in = window.first_slice_window_2D(); + 
slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width); + slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height); + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp new file mode 100644 index 0000000000..0f9a98950d --- /dev/null +++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLHOG.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLHOGDetectorKernel::CLHOGDetectorKernel() + : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr) +{ +} + +void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, + float threshold, uint16_t idx_class) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32); + ARM_COMPUTE_ERROR_ON(hog == nullptr); + ARM_COMPUTE_ERROR_ON(detection_windows == nullptr); + ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr); + ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0); + ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0); + + const Size2D &detection_window_size = hog->info()->detection_window_size(); + const Size2D &block_size = hog->info()->block_size(); + const Size2D &block_stride = hog->info()->block_stride(); + + _input = input; + _detection_windows = detection_windows; + _num_detection_windows = num_detection_windows; + + const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels(); + const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1; + + ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != 
hog->info()->descriptor_size()); + + std::stringstream args_str; + args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " "; + args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " "; + args_str << "-DTHRESHOLD=" << threshold << " "; + args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " "; + args_str << "-DIDX_CLASS=" << idx_class << " "; + args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " "; + args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " "; + args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " "; + args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " "; + + // Construct kernel name + std::set<std::string> build_opts = {}; + build_opts.insert(args_str.str()); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters + _kernel.setArg(idx++, hog->cl_buffer()); + _kernel.setArg(idx++, detection_windows->cl_buffer()); + _kernel.setArg(idx++, *_num_detection_windows); + + // Get the number of blocks along the x and y directions of the input tensor + const ValidRegion &valid_region = input->info()->valid_region(); + const size_t num_blocks_x = valid_region.shape[0]; + const size_t num_blocks_y = valid_region.shape[1]; + + // Get the number of blocks along the x and y directions of the detection window + const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width; + const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height; + + const size_t window_step_x = detection_window_stride.width / block_stride.width; + const size_t window_step_y = detection_window_stride.height / block_stride.height; + + // Configure kernel window + Window win; + 
win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x)); + win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y)); + + constexpr unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y; + + update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration)); + + ICLKernel::configure(win); +} + +void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp new file mode 100644 index 0000000000..9fc34a7760 --- /dev/null +++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLHarrisScoreKernel::CLHarrisScoreKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0) +{ +} + +BorderSize CLHarrisScoreKernel::border_size() const +{ + return _border_size; +} + +void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output, + int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); 
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); + ARM_COMPUTE_ERROR_ON(0.0f == norm_factor); + + _input1 = input1; + _input2 = input2; + _output = output; + _sensitivity = sensitivity; + _strength_thresh = strength_thresh; + _norm_factor = norm_factor; + _border_size = BorderSize(block_size / 2); + + // Select kernel + std::stringstream harris_score_kernel_name; + harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size; + + // Create build options + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts)); + + // Set static kernel arguments + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, sensitivity); + _kernel.setArg(idx++, strength_thresh); + _kernel.setArg(idx++, norm_factor); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + constexpr unsigned int num_elems_written_per_iteration = 4; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), 
input2->info()->valid_region()); + output_access.set_valid_region(win, valid_region, border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp new file mode 100644 index 0000000000..87ee5fb74e --- /dev/null +++ b/src/core/CL/kernels/CLHistogramKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLDistribution1D.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstring> +#include <string> + +using namespace arm_compute; + +// each thread handle 16 pixels +constexpr signed int pixels_per_item = 16; + +// local work group size in X dimension +constexpr unsigned int local_x_size = 16; + +CLHistogramKernel::CLHistogramKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + // Check input size + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + // Check offset + ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); + + // Check range + ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); + + _input = input; + _output = output; + + if(_input->info()->dimension(0) < pixels_per_item) + { + return; + } + + unsigned int num_bins = _output->num_bins(); + unsigned int window_size = _output->window(); + unsigned int offset = _output->offset(); + unsigned int range = _output->range(); + unsigned int offrange = offset + range; + unsigned int bin_size = 
_output->size(); + unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions + + // Create kernel + bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); + std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, buffer_size, nullptr); + _kernel.setArg(idx++, _output->cl_buffer()); + if(!is_fixed_size) + { + _kernel.setArg<cl_uint>(idx++, num_bins); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg<cl_uint>(idx++, range); + _kernel.setArg<cl_uint>(idx++, offrange); + } + + // We only run histogram on Image, therefore only 2 dimensions here + unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item; + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, end_position, pixels_per_item)); + win.set(1, Window::Dimension(0, _input->info()->dimension(1))); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item)); + + ICLKernel::configure(win); +} + +void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + if(_input->info()->dimension(0) < pixels_per_item) + { + return; + } + + _output->map(queue, true); + ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); + memset(_output->buffer(), 0, _output->size()); + _output->unmap(queue); + + Window slice = window.first_slice_window_2D(); + cl::NDRange lws = cl::NDRange(local_x_size, 1); + + do + { + /* Run the core part which has width can be divided by 16 */ + unsigned int idx = 0; + add_2D_tensor_argument(idx, 
_input, slice); + + enqueue(queue, *this, slice, lws); + } + while(window.slide_window_slice_2D(slice)); +} + +CLHistogramBorderKernel::CLHistogramBorderKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + // Check input size + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + // Check offset + ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); + + // Check range + ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); + + // We only run histogram on Image, therefore only 2 dimensions here + unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item; + + if(start_position >= input->info()->dimension(0)) + { + return; // no need to run histogram border kernel + } + + _input = input; + _output = output; + + unsigned int num_bins = _output->num_bins(); + unsigned int window_size = _output->window(); + unsigned int offset = _output->offset(); + unsigned int range = _output->range(); + unsigned int offrange = offset + range; + + // Create kernel + bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); + std::string kernel_name = is_fixed_size ? 
"hist_border_kernel_fixed" : "hist_border_kernel"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, _output->cl_buffer()); + if(!is_fixed_size) + { + _kernel.setArg<cl_uint>(idx++, num_bins); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg<cl_uint>(idx++, range); + _kernel.setArg<cl_uint>(idx++, offrange); + } + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(start_position, _input->info()->dimension(0))); + win.set(1, Window::Dimension(0, _input->info()->dimension(1))); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1)); + ICLKernel::configure(win); +} + +void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue) +{ + if(window.x().start() >= window.x().end()) + { + return; + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + cl::NDRange lws = cl::NDRange(1, 1); + + Window slice = window.first_slice_window_2D(); + + do + { + /* Run the border part which has width cannot be divided by 16 */ + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + enqueue(queue, *this, slice, lws); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp new file mode 100644 index 0000000000..8c0fe26666 --- /dev/null +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <cmath> +#include <tuple> + +using namespace arm_compute; + +CLIm2ColKernel::CLIm2ColKernel() + : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr) +{ +} + +void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + + _input = input; + _output = output; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace((has_bias ? "-DHAS_BIAS" : "")); + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + std::tie(pad_x, pad_y) = conv_info.pad(); + std::tie(stride_x, stride_y) = conv_info.stride(); + + const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) + && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)) + && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0)); + + if(!run_img2col_reduced) + { + _convolved_dims = convolved_dims; + _conv_info = conv_info; + _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 
1 : 0)) / input->info()->dimension(2)); + _num_elems_processed_per_iteration = output->info()->dimension(0); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts)); + + // Create static kernel arguments + const cl_int2 input_dims = + { + { + static_cast<cl_int>(input->info()->dimension(0)), + static_cast<cl_int>(input->info()->dimension(1)), + } + }; + const cl_int2 strides = + { + { + stride_x, + stride_y, + } + }; + const cl_int2 paddings = + { + { + pad_x, + pad_y, + } + }; + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + _kernel.setArg<cl_int>(idx++, _kernel_size); + _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */); + _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */); + _kernel.setArg<cl_int2>(idx++, input_dims); + _kernel.setArg<cl_int2>(idx++, strides); + _kernel.setArg<cl_int2>(idx++, paddings); + + _run_func = &CLIm2ColKernel::run_generic; + } + else + { + _num_elems_processed_per_iteration = 1; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts)); + _run_func = &CLIm2ColKernel::run_reduced; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + ICLKernel::configure(win); +} + +void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON(_run_func == nullptr); + (this->*_run_func)(window, queue); +} + +void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + 
std::tie(pad_x, pad_y) = _conv_info.pad(); + std::tie(stride_x, stride_y) = _conv_info.stride(); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + Window slice_out = window.first_slice_window_3D(); + + // Setup slice + slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1)); + slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1)); + slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input slice + // The first three dimensions of the input are increased by the inner loops + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Setup output slice + slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration)); + slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1)); + slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + // Set inputs + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in)); +} + +void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()); + + Window out_slice = out_window.first_slice_window_1D(); + Window in_slice = window.first_slice_window_3D(); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _output, out_slice); + + 
_kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0)); + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1)); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); +} diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp new file mode 100644 index 0000000000..69ede457df --- /dev/null +++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> + +using namespace arm_compute; + +void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal")); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); + const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +CLIntegralImageVertKernel::CLIntegralImageVertKernel() + : _in_out(nullptr) +{ +} + +void CLIntegralImageVertKernel::configure(ICLTensor *in_out) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32); + + _in_out = in_out; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + const unsigned int 
num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY); + + Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, in_out_access); + + in_out_access.set_valid_region(win, in_out->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const size_t height = _in_out->info()->dimension(1); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _in_out, slice); + _kernel.setArg<cl_uint>(idx++, height); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp new file mode 100644 index 0000000000..12cdd0ec93 --- /dev/null +++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLArray.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> + +using namespace arm_compute; + +void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale) + +{ + ARM_COMPUTE_ERROR_ON(old_points == nullptr); + ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + + const float scale = std::pow(pyramid_scale, level); + + // Create kernel + std::string kernel_name = "init_level"; + if(level == (num_levels - 1)) + { + kernel_name += (use_initial_estimate) ? 
std::string("_max_initial_estimate") : std::string("_max"); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = 0; + if(level == (num_levels - 1)) + { + _kernel.setArg(idx++, old_points->cl_buffer()); + if(use_initial_estimate) + { + _kernel.setArg(idx++, new_points_estimates->cl_buffer()); + } + } + _kernel.setArg(idx++, old_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg<cl_float>(idx++, scale); + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + ICLKernel::configure(window); +} + +void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + enqueue(queue, *this, window); +} + +void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points) + +{ + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points == nullptr); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize")); + + // Set static kernel arguments + unsigned int idx = 0; + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points->cl_buffer()); + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + ICLKernel::configure(window); +} + +void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + enqueue(queue, *this, window); 
+} + +CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel() + : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr) +{ +} + +void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + size_t window_dimension, size_t level) + +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(coeff_table == nullptr); + ARM_COMPUTE_ERROR_ON(old_ival == nullptr); + + _old_input = old_input; + _old_scharr_gx = old_scharr_gx; + _old_scharr_gy = old_scharr_gy; + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const ValidRegion valid_region = intersect_valid_regions( + old_input->info()->valid_region(), + old_scharr_gx->info()->valid_region(), + old_scharr_gy->info()->valid_region()); + + update_window_and_padding(window, + AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1)), + AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1)), + AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1))); + + ICLKernel::configure(window); + + // Initialize required variables + const int level0 = (level == 0) ? 
1 : 0; + const int window_size = window_dimension; + const int window_size_squared = window_dimension * window_dimension; + const int window_size_half = window_dimension / 2; + const float eig_const = 1.0f / (2.0f * window_size_squared); + const cl_float3 border_limits = + { + { + // -1 because we load 2 values at once for bilinear interpolation + static_cast<cl_float>(valid_region.end(0) - window_size - 1), + static_cast<cl_float>(valid_region.end(1) - window_size - 1), + static_cast<cl_float>(valid_region.start(0)) + } + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0")); + + // Set arguments + unsigned int idx = 3 * num_arguments_per_2D_tensor(); + _kernel.setArg(idx++, old_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, coeff_table->cl_buffer()); + _kernel.setArg(idx++, old_ival->cl_buffer()); + _kernel.setArg<cl_int>(idx++, window_size); + _kernel.setArg<cl_int>(idx++, window_size_squared); + _kernel.setArg<cl_int>(idx++, window_size_half); + _kernel.setArg<cl_float3>(idx++, border_limits); + _kernel.setArg<cl_float>(idx++, eig_const); + _kernel.setArg<cl_int>(idx++, level0); +} + +void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Set static tensor arguments. Setting here as allocation might be deferred. 
+ unsigned int idx = 0; + add_2D_tensor_argument(idx, _old_input, window); + add_2D_tensor_argument(idx, _old_scharr_gx, window); + add_2D_tensor_argument(idx, _old_scharr_gy, window); + + enqueue(queue, *this, window); +} + +CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel() + : _new_input(nullptr) +{ +} + +void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level) + +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(coeff_table == nullptr); + ARM_COMPUTE_ERROR_ON(old_ival == nullptr); + + _new_input = new_input; + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const ValidRegion &valid_region = new_input->info()->valid_region(); + + update_window_and_padding(window, + AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1))); + + ICLKernel::configure(window); + + // Initialize required variables + const int level0 = (level == 0) ? 
1 : 0; + const int window_size = window_dimension; + const int window_size_squared = window_dimension * window_dimension; + const int window_size_half = window_dimension / 2; + const float eig_const = 1.0f / (2.0f * window_size_squared); + const cl_float3 border_limits = + { + { + // -1 because we load 2 values at once for bilinear interpolation + static_cast<cl_float>(valid_region.end(0) - window_size - 1), + static_cast<cl_float>(valid_region.end(1) - window_size - 1), + static_cast<cl_float>(valid_region.start(0)) + } + }; + const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; + const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1")); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, coeff_table->cl_buffer()); + _kernel.setArg(idx++, old_ival->cl_buffer()); + _kernel.setArg<cl_int>(idx++, window_size); + _kernel.setArg<cl_int>(idx++, window_size_squared); + _kernel.setArg<cl_int>(idx++, window_size_half); + _kernel.setArg<cl_int>(idx++, num_iterations); + _kernel.setArg<cl_float>(idx++, epsilon); + _kernel.setArg<cl_float3>(idx++, border_limits); + _kernel.setArg<cl_float>(idx++, eig_const); + _kernel.setArg<cl_int>(idx++, level0); + _kernel.setArg<cl_int>(idx++, term_iteration); + _kernel.setArg<cl_int>(idx++, term_epsilon); +} + +void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Set static tensor arguments. Setting here as allocation might be deferred. 
+ unsigned int idx = 0; + add_2D_tensor_argument(idx, _new_input, window); + + enqueue(queue, *this, window); +} diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..794a1bc56e --- /dev/null +++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + + _input0 = input0; + _input1 = input1; + _output = output; + + if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } + + std::ostringstream mm_arguments; + std::set<std::string> build_opts; + + mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " "; + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts)); + + // Configure window kernel + const unsigned int 
num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + Window matrix_b_window; + matrix_b_window.use_tensor_dimensions(_input1->info()); + Window slice_matrix_b = matrix_b_window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_3D_tensor_argument(idx, _input1, slice_matrix_b); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp new file mode 100644 index 0000000000..c504189169 --- /dev/null +++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLMagnitudePhaseKernel::CLMagnitudePhaseKernel() + : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false) +{ +} + +void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, + MagnitudeType mag_type, PhaseType phase_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy); + + _run_mag = (magnitude != nullptr); + _run_phase = (phase != nullptr); + if(_run_mag) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude); + } + if(_run_phase) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + } + + if(!_run_mag && !_run_phase) + { + ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); + } + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + // Construct kernel name + std::set<std::string> build_opts = {}; + + // Add magnitude type + if(_run_mag) + { + switch(mag_type) + { + case MagnitudeType::L1NORM: + build_opts.insert("-DMAGNITUDE=1"); + break; + case MagnitudeType::L2NORM: + build_opts.insert("-DMAGNITUDE=2"); + break; + default: + 
ARM_COMPUTE_ERROR("Unsupported magnitude calculation type."); + build_opts.insert("-DMAGNITUDE=0"); + break; + } + } + + // Add phase type + if(_run_phase) + { + switch(phase_type) + { + case PhaseType::UNSIGNED: + build_opts.insert("-DPHASE=1"); + break; + case PhaseType::SIGNED: + build_opts.insert("-DPHASE=2"); + break; + default: + ARM_COMPUTE_ERROR("Unsupported phase calculation type."); + build_opts.insert("-DPHASE=0"); + break; + } + } + + // Add data_type + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_phase_access(phase == nullptr ? 
nullptr : phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + gx_access, gy_access, + output_magnitude_access, output_phase_access); + + ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), + gy->info()->valid_region()); + output_magnitude_access.set_valid_region(win, valid_region); + output_phase_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _gx, slice); + add_2D_tensor_argument(idx, _gy, slice); + + if(_run_mag) + { + add_2D_tensor_argument(idx, _magnitude, slice); + } + + if(_run_phase) + { + add_2D_tensor_argument(idx, _phase, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp new file mode 100644 index 0000000000..b0b748f466 --- /dev/null +++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <set> +#include <string> + +using namespace arm_compute; + +CLMeanStdDevKernel::CLMeanStdDevKernel() + : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr) +{ +} + +void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == mean); + ARM_COMPUTE_ERROR_ON(nullptr == global_sum); + ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared); + + _input = input; + _mean = mean; + _stddev = stddev; + _global_sum = global_sum; + _global_sum_squared = global_sum_squared; + + // Create kernel + std::set<std::string> build_opts; + + if(_stddev != nullptr) + { + build_opts.insert("-DSTDDEV"); + } + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", build_opts)); + + // Set fixed arguments + unsigned 
int idx = num_arguments_per_2D_tensor(); //Skip the input parameters + + _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1))); + _kernel.setArg(idx++, *_global_sum); + + if(_stddev != nullptr) + { + _kernel.setArg(idx++, *_global_sum_squared); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + update_window_and_padding(win, input_access); + + ICLKernel::configure(win); +} + +void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Clear sums + static const cl_ulong zero = 0; + queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero); + + if(_stddev != nullptr) + { + queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero); + } + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + // Set slice step equal to height to force gws[1] to 1, + // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item + slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + + // Calculate mean and stddev + cl_ulong global_sum = 0; + cl_ulong global_sum_squared = 0; + const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1); + + queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), 
static_cast<void *>(&global_sum)); + const float mean = global_sum / num_pixels; + *_mean = mean; + + if(_stddev != nullptr) + { + queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared)); + *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean)); + } +} diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp new file mode 100644 index 0000000000..95334c7b5f --- /dev/null +++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +BorderSize CLMedian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" })); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp new file mode 100644 index 0000000000..939a53b03a --- /dev/null +++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <climits> + +using namespace arm_compute; + +CLMinMaxKernel::CLMinMaxKernel() + : _input(nullptr), _min_max(), _data_type_max_min() +{ +} + +void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(min_max == nullptr); + + _input = input; + _min_max = min_max; + const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); + + switch(input->info()->data_type()) + { + case DataType::U8: + _data_type_max_min[0] = UCHAR_MAX; + _data_type_max_min[1] = 0; + break; + case DataType::S16: + _data_type_max_min[0] = SHRT_MAX; + _data_type_max_min[1] = SHRT_MIN; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong image data types"); + } + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0])); + build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1])); + build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? 
"-DNON_MULTIPLE_OF_16" : ""); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts)); + + // Set fixed arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, *_min_max); + _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Reset mininum and maximum values + queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data()); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLMinMaxLocationKernel::CLMinMaxLocationKernel() + : _input(nullptr), _min_max_count(nullptr) +{ +} + +void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(min_max == nullptr); + ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr); + + _input = input; + _min_max_count = min_max_count; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + 
build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : ""); + build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : ""); + build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : ""); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts)); + + // Set static arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, *min_max); + _kernel.setArg(idx++, *min_max_count); + if(min_loc != nullptr) + { + _kernel.setArg(idx++, min_loc->cl_buffer()); + _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values()); + } + if(max_loc != nullptr) + { + _kernel.setArg(idx++, max_loc->cl_buffer()); + _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values()); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + static const unsigned int zero_count = 0; + queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count); + queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp new file mode 100644 index 0000000000..6afa5822ba --- 
/dev/null +++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLNonLinearFilterKernel::CLNonLinearFilterKernel() + : _border_size(0) +{ +} + +BorderSize CLNonLinearFilterKernel::border_size() const +{ + return _border_size; +} + +void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, + unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5); + ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!"); + ARM_COMPUTE_UNUSED(mask); + + _input = input; + _output = output; + _border_size = BorderSize(mask_size / 2); + + // Define build options + std::set<std::string> build_opts; + build_opts.emplace("-D" + string_from_non_linear_filter_function(function)); + + // Define kernel + std::string pattern_name = string_from_matrix_pattern(pattern); + std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower); + std::stringstream ss; + ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(ss.str(), build_opts)); + + // 
Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + const unsigned int num_rows_read_per_iteration = mask_size; + + Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp new file mode 100644 index 0000000000..6a96b0effd --- /dev/null +++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); + + _input = input; + _output = output; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal 
output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..106a5113db --- /dev/null +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLNormalizationLayerKernel::CLNormalizationLayerKernel() + : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0) +{ +} + +BorderSize CLNormalizationLayerKernel::border_size() const +{ + return _border_size; +} + +void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); + ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented"); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + _input = input; + _squared_input = squared_input; + _output = output; + + const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D); + const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0; + _border_size = BorderSize(0, border_width); + + // Create kernel + std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? 
"normalization_layer_in_map_1D" : "normalization_layer_cross_map"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set kernel static arguments + unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output parameters + _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff()); + _kernel.setArg<cl_float>(idx++, norm_info.beta()); + _kernel.setArg<cl_float>(idx++, norm_info.kappa()); + _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1; + const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, squared_input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _squared_input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp 
b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp new file mode 100644 index 0000000000..84eb434bc9 --- /dev/null +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstdlib> +#include <set> +#include <string> + +using namespace arm_compute; + +CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. 
"); + + _input1 = input1; + _input2 = input2; + _output = output; + + int scale_int = -1; + // Extract sign, exponent and mantissa + int exponent = 0; + float normalized_mantissa = std::frexp(scale, &exponent); + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + { + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + scale_int = std::abs(exponent - 1); + } + + std::string data_type; + std::string compute_type; + // Check if it has float inputs and output + if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type())) + { + scale_int = -1; + compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half"; + data_type = "DATA_TYPE_FLOAT"; + } + else + { + compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort"; + data_type = "DATA_TYPE_INT"; + } + + // Construct kernel name + std::string kernel_name = "pixelwise_mul"; + kernel_name += (scale_int >= 0) ? "_int" : "_float"; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); + build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" : "-DROUND=_rte"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); + build_opts.emplace("-D" + data_type); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set scale argument + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters + + if(scale_int >= 0) + { + _kernel.setArg(idx++, scale_int); + } + else + { + _kernel.setArg(idx++, scale); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); 
+ enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp new file mode 100644 index 0000000000..dc5ae4ec7a --- /dev/null +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> +#include <tuple> + +using namespace arm_compute; + +CLPoolingLayerKernel::CLPoolingLayerKernel() + : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0) +{ +} + +BorderSize CLPoolingLayerKernel::border_size() const +{ + return _border_size; +} + +void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info) +{ + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + const PoolingType pool_type = pool_info.pool_type(); + const int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + DimensionRoundingType pool_round = pad_stride_info.round(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size); + ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), + input->info()->dimension(1), + pool_size, + pool_stride_x, pool_stride_y, + 
pool_pad_x, pool_pad_y, + pool_round); + ARM_COMPUTE_UNUSED(pooled_w); + ARM_COMPUTE_UNUSED(pooled_h); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); + + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + // Set instance variables + _input = input; + _output = output; + _pool_info = pool_info; + _border_size = BorderSize(pool_pad_y, pool_pad_x); + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? 
std::string("MAX") : std::string("AVG")))); + + // Create kernel + std::string kernel_name = "pooling_layer_" + val_to_string(pool_size); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set static kernel arguments + if(pool_type == PoolingType::AVG) + { + // Create static kernel arguments + const cl_int2 max_dims = + { + { + static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x, + static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y, + } + }; + const cl_int2 strides = + { + { + pool_stride_x, + pool_stride_y, + } + }; + const cl_int2 paddings = + { + { + pool_pad_x, + pool_pad_y, + } + }; + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + _kernel.setArg<cl_int2>(idx++, max_dims); + _kernel.setArg<cl_int2>(idx++, strides); + _kernel.setArg<cl_int2>(idx++, paddings); + } + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + Window slice = 
window.first_slice_window_3D(); + + do + { + // Upsample input by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y)); + + // Set inputs + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp new file mode 100644 index 0000000000..e63a5ef7c6 --- /dev/null +++ b/src/core/CL/kernels/CLRemapKernel.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLRemapKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <algorithm> + +using namespace arm_compute; + +CLRemapKernel::CLRemapKernel() + : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr) +{ +} + +BorderSize CLRemapKernel::border_size() const +{ + return BorderSize(1); +} + +void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!"); + + _input = input; + _output = output; + _map_x = map_x; + _map_y = map_y; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string kernel_name = "remap_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 4; + const int border_offset = (border_undefined) ? 
0 : border_size().left; + + Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowStatic input_access(output->info(), -border_offset, -border_offset, + _output->info()->dimension(0) + border_offset, _output->info()->dimension(1) + border_offset); + AccessWindowHorizontal output_access(input->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); + + // Set static arguments + unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<cl_float>(idx++, input->info()->dimension(0)); + _kernel.setArg<cl_float>(idx++, input->info()->dimension(1)); +} + +void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + add_2D_tensor_argument(idx, _map_x, slice); + add_2D_tensor_argument(idx, _map_y, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp new file mode 100644 index 0000000000..d74e837ace --- /dev/null +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLScaleKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLScaleKernel::border_size() const +{ + return BorderSize(1); +} + +void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + /* Compute the ratio between source width/height and destination width/height */ + const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0)); + const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1)); + + /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */ + if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + { + policy = InterpolationPolicy::NEAREST_NEIGHBOR; + } + else + { + ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA); + } + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string 
kernel_name = "scale_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + const int border_offset = (border_undefined) ? 0 : border_size().left; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, + input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<float>(idx++, input->info()->dimension(0)); + _kernel.setArg<float>(idx++, input->info()->dimension(1)); + _kernel.setArg<float>(idx++, output->info()->dimension(0)); + _kernel.setArg<float>(idx++, output->info()->dimension(1)); +} diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp new file mode 100644 index 0000000000..913ef592d4 --- /dev/null +++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLScharr3x3Kernel::CLScharr3x3Kernel() + : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr) +{ +} + +BorderSize CLScharr3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_scharr_x = output_x != nullptr; + _run_scharr_y = output_y != nullptr; + + if(_run_scharr_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_scharr_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_scharr_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_scharr_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), 
Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_scharr_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_scharr_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp new file mode 100644 index 0000000000..436aaa498a --- /dev/null +++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLSobel3x3Kernel::CLSobel3x3Kernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize CLSobel3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), 
border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp new file mode 100644 index 0000000000..4c0316f19e --- /dev/null +++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLSobel5x5HorKernel::CLSobel5x5HorKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) +{ +} + +BorderSize CLSobel5x5HorKernel::border_size() const +{ + return _border_size; +} + +void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + _border_size = BorderSize(border_undefined ? 
0 : 2, 2); + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + 
+CLSobel5x5VertKernel::CLSobel5x5VertKernel() + : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize CLSobel5x5VertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input_x = input_x; + _input_y = input_y; + _output_x = output_x; + _output_y = output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts)); + + const ICLTensor *input = _run_sobel_x ? _input_x : _input_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 5; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_x_access(input_x == nullptr ? 
nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _input_x, slice); + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _input_y, slice); + add_2D_tensor_argument(idx, _output_y, slice); + } + + _kernel.setArg(idx++, 0 /*dummy*/); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp new file mode 100644 index 0000000000..a477953cfb --- /dev/null +++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLSobel7x7HorKernel::CLSobel7x7HorKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) +{ +} + +BorderSize CLSobel7x7HorKernel::border_size() const +{ + return _border_size; +} + +void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + _border_size = BorderSize(border_undefined ? 
0 : 3, 3); + + // Construct kernel name + std::string kernel_name = "sobel_separable1x7"; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLSobel7x7VertKernel::CLSobel7x7VertKernel() + : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize CLSobel7x7VertKernel::border_size() const +{ + return BorderSize(3, 0); +} + +void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); + } + + _input_x = input_x; + _input_y = input_y; + _output_x = output_x; + _output_y = 
output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts)); + + const ICLTensor *input = _run_sobel_x ? _input_x : _input_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 7; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _input_x, slice); + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _input_y, slice); + add_2D_tensor_argument(idx, _output_y, slice); + } + + _kernel.setArg(idx++, 0 /*dummy*/); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp new file mode 100644 index 0000000000..0470d5243e --- /dev/null +++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Builds the "softmax_layer_max" OpenCL kernel for the given input/output pair,
+// binds its fixed width argument and sets up the execution window.
+void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _output = output;
+    _input  = input;
+
+    // Size of the input along dimension 0
+    const auto input_width = input->info()->dimension(0);
+
+    // A single iteration spans the whole width, rounded up to a multiple of 16
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input_width, 16);
+
+    // The data type is selected at kernel build time
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DUSE_" + string_from_data_type(input->info()->data_type()));
+
+    // Let the kernel know when the width leaves a remainder.
+    // NOTE(review): the step above uses the literal 16 while this test uses
+    // max_cl_vector_width; the two presumably have to agree - confirm.
+    if((input_width % max_cl_vector_width) != 0)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+    // The width is passed right after the arguments of the two 2D tensors (input and output)
+    unsigned int arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(arg_idx++, input_width);
+
+    // Execution window: one output element is written per iteration
+    constexpr unsigned int num_elems_written_per_iteration = 1;
+
+    Window                 exec_window = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(exec_window, input_access, output_access);
+
+    // The whole output tensor shape is declared valid
+    output_access.set_valid_region(exec_window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(exec_window);
+}
+
+// All tensor pointers start out unset; configure() fills them in.
+CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+// Builds the "softmax_layer_shift_exp_sum" OpenCL kernel, binds its fixed width
+// argument and sets up the execution window over the four tensors.
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+
+    _sum    = sum;
+    _output = output;
+    _max    = max;
+    _input  = input;
+
+    // Size of the input along dimension 0
+    const auto input_width = input->info()->dimension(0);
+
+    // A single iteration spans the whole width, rounded up to a multiple of 16
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input_width, 16);
+
+    // The data type is selected at kernel build time
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DUSE_" + string_from_data_type(input->info()->data_type()));
+
+    // Let the kernel know when the width leaves a remainder.
+    // NOTE(review): literal 16 above vs max_cl_vector_width here - presumably equal, confirm.
+    if((input_width % max_cl_vector_width) != 0)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+    // The width is passed right after the arguments of the four 2D tensors
+    unsigned int arg_idx = 4 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(arg_idx++, input_width);
+
+    // Execution window: max and sum are accessed one element at a time
+    Window exec_window = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal max_access(max->info(), 0, 1);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+    update_window_and_padding(exec_window, input_access, max_access, output_access, sum_access);
+
+    // Output inherits the input's valid region; the whole sum tensor shape is declared valid
+    output_access.set_valid_region(exec_window, input->info()->valid_region());
+    sum_access.set_valid_region(exec_window, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    ICLKernel::configure(exec_window);
+}
+
+// Enqueues the kernel once per 2D slice of the given window.
+void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice_2d))
+    {
+        // Kernel argument order: input, max, output, sum
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+        add_2D_tensor_argument(arg_idx, _max, slice_2d);
+        add_2D_tensor_argument(arg_idx, _output, slice_2d);
+        add_2D_tensor_argument(arg_idx, _sum, slice_2d);
+
+        enqueue(queue, *this, slice_2d);
+    }
+}
+
+// All tensor pointers start out unset; configure() fills them in.
+CLLogits1DNormKernel::CLLogits1DNormKernel()
+    : _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+// Builds the "softmax_layer_norm" OpenCL kernel and sets up its execution window.
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+
+    _output = output;
+    _sum    = sum;
+    _input  = input;
+
+    // The data type is selected at kernel build time
+    const std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+    // Execution window: 16 elements per iteration along X
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    // The sum tensor is accessed through a fixed rectangle: x in [0, 1), y in [0, dimension(1))
+    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(exec_window, input_access, sum_access, output_access);
+
+    // Output inherits the input's valid region
+    output_access.set_valid_region(exec_window, input->info()->valid_region());
+
+    ICLKernel::configure(exec_window);
+}
+
+// Enqueues the kernel once per 2D slice of the given window.
+void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice_2d = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice_2d))
+    {
+        // The sum tensor uses the same slice with the X dimension collapsed to [0, 1)
+        Window sum_slice_2d = slice_2d;
+        sum_slice_2d.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        // Kernel argument order: input, sum, output
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, slice_2d);
+        add_2D_tensor_argument(arg_idx, _sum, sum_slice_2d);
+        add_2D_tensor_argument(arg_idx, _output, slice_2d);
+
+        enqueue(queue, *this, slice_2d);
+    }
+}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
new file mode 100644
index
0000000000..bbdaa37410 --- /dev/null +++ b/src/core/CL/kernels/CLTableLookupKernel.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+#include <string>
+
+using namespace arm_compute;
+
+// Creates the "tablelookup_U8" or "tablelookup_S16" OpenCL kernel depending on the
+// LUT type, binds the LUT arguments and configures the parent 2D kernel.
+void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(lut == nullptr);
+    ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    const bool is_s16_lut = (DataType::S16 == lut->type());
+
+    // The kernel variant follows the LUT type
+    const std::string kernel_name = std::string("tablelookup_") + (is_s16_lut ? "S16" : "U8");
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // The LUT arguments come right after the arguments of the two 2D tensors (input and output)
+    unsigned int arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, lut->cl_buffer());
+
+    // Only the S16 variant additionally receives the index offset and the element count
+    if(is_s16_lut)
+    {
+        _kernel.setArg(arg_idx++, lut->index_offset());
+        _kernel.setArg(arg_idx++, static_cast<uint32_t>(lut->num_elements()));
+    }
+
+    // Hand over to the parent class with 8 elements processed per iteration
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
new file mode 100644
index 0000000000..6e07cefc77
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+// Creates the "threshold_binary" or "threshold_range" OpenCL kernel depending on
+// the threshold type, binds the threshold values and configures the parent 2D kernel.
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+                                  uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    const bool is_range = (ThresholdType::RANGE == type);
+
+    // The kernel name is "threshold" plus a suffix chosen by the threshold type
+    std::string kernel_name("threshold");
+
+    if(ThresholdType::BINARY == type)
+    {
+        kernel_name.append("_binary");
+    }
+    else if(is_range)
+    {
+        kernel_name.append("_range");
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Thresholding type not recognized");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // The scalar arguments come right after the arguments of the two 2D tensors (input and output)
+    unsigned int arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, false_value);
+    _kernel.setArg(arg_idx++, true_value);
+    _kernel.setArg(arg_idx++, threshold);
+
+    // The upper bound is only passed to the range variant
+    if(is_range)
+    {
+        _kernel.setArg(arg_idx++, upper);
+    }
+
+    // _kernel has to be created before the parent's configure is called
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
new file mode 100644
index 0000000000..2ee6fcb9dc
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+// Creates the "transpose" OpenCL kernel: derives the output shape by swapping the
+// first two input dimensions, auto-initializes the output if needed and sets up
+// the execution window.
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Expected output shape: the input shape with dimensions 0 and 1 exchanged
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.set(0, input->info()->dimension(1));
+    output_shape.set(1, input->info()->dimension(0));
+
+    // Auto-initialize the output tensor if it has not been initialized yet
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _lws_hint = cl::NDRange(2, 8);
+    _output   = output;
+    _input    = input;
+
+    // The element size in bytes is passed to the kernel at build time
+    std::ostringstream element_size_str;
+    element_size_str << input->info()->element_size();
+
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE_IN_BYTES=" + element_size_str.str());
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts));
+
+    // Execution window: a square tile whose side is max_cl_vector_width divided by the element size
+    const unsigned int tile_side = max_cl_vector_width / input->info()->element_size();
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(tile_side, tile_side));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, tile_side, tile_side);
+    AccessWindowTranspose output_access(output->info(), 0, 0, tile_side, tile_side);
+
+    update_window_and_padding(exec_window, input_access, output_access);
+
+    output_access.set_valid_region(exec_window, input->info()->valid_region());
+
+    ICLKernel::configure(exec_window);
+}
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
new file mode 100644
index 0000000000..e549dbc258
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <limits>
+#include <locale>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+// Adds one "-DMAT<i>=<value> " build option per matrix coefficient.
+//
+// options  Set of build options the new entries are inserted into.
+// matrix   Pointer to at least 'size' coefficients.
+// size     Number of coefficients to emit.
+void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i)
+    {
+        std::stringstream mat_str;
+        // Use the classic "C" locale so the decimal separator is always '.',
+        // whatever global locale the application has installed.
+        mat_str.imbue(std::locale::classic());
+        // max_digits10 significant digits are enough for the printed value to
+        // convert back to the exact same float; the default of 6 truncates it.
+        mat_str.precision(std::numeric_limits<float>::max_digits10);
+        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+        options.insert(mat_str.str());
+    }
+}
+} // namespace
+
+// The kernel declares a border of 1 on every side.
+BorderSize CLWarpAffineKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+// Creates the "warp_affine_<interpolation>" OpenCL kernel.
+//
+// input   Source tensor, single channel U8.
+// output  Destination tensor, single channel U8.
+// matrix  Pointer to the 6 coefficients passed to the kernel as MAT0..MAT5 build options.
+// policy  Interpolation policy; AREA is rejected.
+void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+    // The coefficients are read unconditionally below
+    ARM_COMPUTE_ERROR_ON(matrix == nullptr);
+
+    _input  = input;
+    _output = output;
+
+    // Create build options
+    std::set<std::string> options;
+    options_add_matrix(options, matrix, 6);
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    // Create kernel: the kernel name suffix is the lower-cased interpolation policy name.
+    // std::tolower is only defined for values representable as unsigned char,
+    // hence the unsigned char parameter of the lambda.
+    std::string interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(),
+                   [](unsigned char c)
+    {
+        return static_cast<char>(std::tolower(c));
+    });
+    std::string kernel_name = "warp_affine_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Set static kernel arguments
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The whole output tensor shape is declared valid
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
new file mode 100644
index 0000000000..fddb580750
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <limits>
+#include <locale>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+// Adds one "-DMAT<i>=<value> " build option per matrix coefficient.
+//
+// options  Set of build options the new entries are inserted into.
+// matrix   Pointer to at least 'size' coefficients.
+// size     Number of coefficients to emit.
+inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i)
+    {
+        std::stringstream mat_str;
+        // Use the classic "C" locale so the decimal separator is always '.',
+        // whatever global locale the application has installed.
+        mat_str.imbue(std::locale::classic());
+        // max_digits10 significant digits are enough for the printed value to
+        // convert back to the exact same float; the default of 6 truncates it.
+        mat_str.precision(std::numeric_limits<float>::max_digits10);
+        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+        options.insert(mat_str.str());
+    }
+}
+} // namespace
+
+// The kernel declares a border of 1 on every side.
+BorderSize CLWarpPerspectiveKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+// Creates the "warp_perspective_<interpolation>" OpenCL kernel.
+//
+// input   Source tensor, single channel U8.
+// output  Destination tensor, single channel U8.
+// matrix  Pointer to the 9 coefficients passed to the kernel as MAT0..MAT8 build options.
+// policy  Interpolation policy; AREA is rejected.
+void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+    // The coefficients are read unconditionally below
+    ARM_COMPUTE_ERROR_ON(matrix == nullptr);
+
+    _input  = input;
+    _output = output;
+
+    // Create build options
+    std::set<std::string> options;
+    options_add_matrix(options, matrix, 9);
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    // Create kernel: the kernel name suffix is the lower-cased interpolation policy name.
+    // std::tolower is only defined for values representable as unsigned char,
+    // hence the unsigned char parameter of the lambda.
+    std::string interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(),
+                   [](unsigned char c)
+    {
+        return static_cast<char>(std::tolower(c));
+    });
+    std::string kernel_name = "warp_perspective_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Set static kernel arguments
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The whole output tensor shape is declared valid
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..018f272921
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Stores the is_shared flag passed by the derived class; tensor pointers start
+// out unset and are filled in by configure().
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+    : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+// Creates the "reshape_to_columns" OpenCL kernel.
+//
+// input   Weights tensor, F16 or F32. At most 5 dimensions when the kernel was
+//         constructed with is_shared == true, at most 4 otherwise.
+// biases  Optional biases tensor (may be nullptr), same data type as input.
+// output  Destination tensor, same data type as input. At most 3 dimensions
+//         when is_shared == true, at most 2 otherwise.
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    // A single -DDATA_TYPE build option is passed below, so all tensors must share one type
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    if(_is_shared)
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+    }
+
+    const bool has_biases = (biases != nullptr);
+
+    // Check biases
+    if(has_biases)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    }
+
+    _biases = biases;
+    _output = output;
+    _input  = input;
+
+    // Create build options; HAS_BIAS is only defined when biases are provided
+    // (no empty option string is added otherwise)
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    if(has_biases)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+    // Set static arguments: skip the 3D input, the 2D output and, if present, the 1D biases
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    idx += has_biases ? num_arguments_per_1D_tensor() : 0;
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+// Convolution layer variant: constructs the base kernel with is_shared == false.
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(false)
+{
+}
+
+// Binds input, output and (optionally) biases and enqueues the kernel once, on
+// the first 3D slice of the given window.
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // The output is addressed through a window built from its own tensor dimensions
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info());
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    // Set arguments
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, in_slice);
+    add_2D_tensor_argument(idx, _output, out_slice);
+    if(_biases != nullptr)
+    {
+        // The biases window spans the X extent of the biases tensor shape
+        Window biases_slice;
+        biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+        add_1D_tensor_argument(idx, _biases, biases_slice);
+    }
+
+    // Run kernel
+    enqueue(queue, *this, in_slice);
+}
+
+// Locally connected layer variant: constructs the base kernel with is_shared == true.
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(true)
+{
+}
+
+// Enqueues the kernel once per input slice, advancing the input, output and
+// (optional) biases slices together.
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    const bool has_biases = (_biases != nullptr);
+
+    // The output is addressed through a window built from its own tensor dimensions
+    Window output_window;
+    output_window.use_tensor_dimensions(_output->info());
+
+    Window input_slice  = window.first_slice_window_3D();
+    Window output_slice = output_window.first_slice_window_2D();
+
+    // The biases windows stay default-constructed when no biases were configured
+    Window bias_window;
+    Window bias_slice;
+
+    if(has_biases)
+    {
+        bias_window.use_tensor_dimensions(_biases->info());
+        bias_slice = bias_window.first_slice_window_1D();
+    }
+
+    // The first slice is always processed; the loop stops as soon as either the
+    // input or the output window cannot slide any further
+    for(bool has_slice = true; has_slice;
+        has_slice = window.slide_window_slice_4D(input_slice) && output_window.slide_window_slice_2D(output_slice))
+    {
+        // Kernel argument order: input, output, then biases when present
+        unsigned arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, input_slice);
+        add_2D_tensor_argument(arg_idx, _output, output_slice);
+        if(has_biases)
+        {
+            add_1D_tensor_argument(arg_idx, _biases, bias_slice);
+            // Move on to the next biases slice for the following iteration
+            bias_window.slide_window_slice_1D(bias_slice);
+        }
+
+        enqueue(queue, *this, input_slice);
+    }
+}