diff options
Diffstat (limited to 'src/core/CL')
130 files changed, 23403 insertions, 0 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp new file mode 100644 index 0000000000..21b72ddd3b --- /dev/null +++ b/src/core/CL/CLHelpers.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLTypes.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Types.h" + +#include <map> +#include <vector> + +namespace +{ +arm_compute::GPUTarget get_bifrost_target(const std::string &name) +{ + arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD; + + if(name == "G7") + { + target = arm_compute::GPUTarget::G70; + } + + return target; +} + +arm_compute::GPUTarget get_midgard_target(const std::string &name) +{ + arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD; + + if(name == "T6") + { + target = arm_compute::GPUTarget::T600; + } + else if(name == "T7") + { + target = arm_compute::GPUTarget::T700; + } + else if(name == "T8") + { + target = arm_compute::GPUTarget::T800; + } + + return target; +} +} // namespace + +namespace arm_compute +{ +std::string get_cl_type_from_data_type(const DataType &dt) +{ + switch(dt) + { + case DataType::U8: + return "uchar"; + case DataType::S8: + return "char"; + case DataType::U16: + return "ushort"; + case DataType::S16: + return "short"; + case DataType::U32: + return "uint"; + case DataType::S32: + return "int"; + case DataType::U64: + return "ulong"; + case DataType::S64: + return "long"; + case DataType::F16: + return "half"; + case DataType::F32: + return "float"; + default: + ARM_COMPUTE_ERROR("Unsupported input data type."); + return ""; + } +} + +const std::string &string_from_target(GPUTarget target) +{ + static std::map<GPUTarget, const std::string> gpu_target_map = + { + { GPUTarget::MIDGARD, "midgard" }, + { GPUTarget::BIFROST, "bifrost" }, + { GPUTarget::T600, "t600" }, + { GPUTarget::T700, "t700" }, + { GPUTarget::T800, "t800" }, + { GPUTarget::G70, "g70" } + }; + + return gpu_target_map[target]; +} + +GPUTarget get_target_from_device(cl::Device &device) +{ + const std::string name_mali("Mali-"); + GPUTarget target{ GPUTarget::MIDGARD }; + + size_t name_size = 0; + std::vector<char> name; + + // 
Query device name size + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size); + ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information"); + // Resize vector + name.resize(name_size); + // Query device name + err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information"); + ARM_COMPUTE_UNUSED(err); + + std::string name_str(name.begin(), name.end()); + auto pos = name_str.find(name_mali); + + if(pos != std::string::npos) + { + ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected."); + std::string sub_name = name_str.substr(pos + name_mali.size(), 2); + + if(sub_name[0] == 'G') + { + target = get_bifrost_target(sub_name); + } + else if(sub_name[0] == 'T') + { + target = get_midgard_target(sub_name); + } + else + { + ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one."); + } + } + else + { + ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one."); + } + + return target; +} + +GPUTarget get_arch_from_target(GPUTarget target) +{ + return (target & GPUTarget::GPU_ARCH_MASK); +} +} // namespace arm_compute diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 0000000000..15a5d90835 --- /dev/null +++ b/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/CLKernelLibrary.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +Program::Program() + : _context(), _device(), _is_binary(false), _name(), _source(), _binary() +{ +} + +Program::Program(cl::Context context, std::string name, std::string source) + : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary() +{ +} + +Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary) + : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary)) +{ +} + +Program::operator cl::Program() const +{ + if(_is_binary) + { + return cl::Program(_context, { _device }, { _binary }); + } + else + { + return cl::Program(_context, _source, false); + } +} + +bool Program::build(const cl::Program &program, const std::string &build_options) +{ + try + { + return program.build(build_options.c_str()) == CL_SUCCESS; + } + catch(const cl::Error &e) + { + cl_int err = CL_SUCCESS; + const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err); + + for(auto &pair : build_info) + { + std::cerr << pair.second << std::endl; + } + + return false; + } +} + +cl::Program Program::build(const std::string &build_options) const +{ + cl::Program cl_program = static_cast<cl::Program>(*this); + build(cl_program, build_options); + return cl_program; +} + +Kernel::Kernel() + : _name(), _kernel() +{ +} + +Kernel::Kernel(std::string name, const cl::Program &program) + : _name(std::move(name)), + _kernel(cl::Kernel(program, _name.c_str())) +{ +} + +const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = +{ + { "absdiff", "absdiff.cl" }, + { "accumulate", "accumulate.cl" }, + { "accumulate_squared", 
"accumulate.cl" }, + { "accumulate_weighted", "accumulate.cl" }, + { "activation_layer", "activation_layer.cl" }, + { "arithmetic_add", "arithmetic_op.cl" }, + { "arithmetic_sub", "arithmetic_op.cl" }, + { "bitwise_or", "bitwise_op.cl" }, + { "bitwise_and", "bitwise_op.cl" }, + { "bitwise_xor", "bitwise_op.cl" }, + { "bitwise_not", "bitwise_op.cl" }, + { "channel_combine_NV", "channel_combine.cl" }, + { "channel_combine_RGB888", "channel_combine.cl" }, + { "channel_combine_RGBA8888", "channel_combine.cl" }, + { "channel_combine_UYVY422", "channel_combine.cl" }, + { "channel_combine_YUYV422", "channel_combine.cl" }, + { "channel_extract_NV12", "channel_extract.cl" }, + { "channel_extract_NV21", "channel_extract.cl" }, + { "channel_extract_RGB888", "channel_extract.cl" }, + { "channel_extract_RGBA8888", "channel_extract.cl" }, + { "channel_extract_UYVY422", "channel_extract.cl" }, + { "channel_extract_YUYV422", "channel_extract.cl" }, + { "combine_gradients_L1", "canny.cl" }, + { "combine_gradients_L2", "canny.cl" }, + { "concatenate_depth", "concatenate.cl" }, + { "convolution_rectangle", "convolution_rectangle.cl" }, + { "col2im", "convolution_layer.cl" }, + { "convolution3x3_static", "convolution3x3.cl" }, + { "convolution5x5_static", "convolution5x5.cl" }, + { "convolution7x7_static", "convolution7x7.cl" }, + { "convolution9x9_static", "convolution9x9.cl" }, + { "convolution_separable1x5_static", "convolution5x5.cl" }, + { "convolution_separable5x1_static", "convolution5x5.cl" }, + { "convolution_separable1x7_static", "convolution7x7.cl" }, + { "convolution_separable7x1_static", "convolution7x7.cl" }, + { "convolution_separable1x9_static", "convolution9x9.cl" }, + { "convolution_separable9x1_static", "convolution9x9.cl" }, + { "convert_depth_down", "depth_convert.cl" }, + { "convert_depth_up", "depth_convert.cl" }, + { "copy_plane", "channel_extract.cl" }, + { "copy_planes_3p", "channel_combine.cl" }, + { "copy_to_keypoint", "fast_corners.cl" }, + { "derivative", 
"derivative.cl" }, + { "dilate", "dilate.cl" }, + { "erode", "erode.cl" }, + { "fast_corners", "fast_corners.cl" }, + { "fill_image_borders_constant", "fill_border.cl" }, + { "fill_image_borders_replicate", "fill_border.cl" }, + { "finalize", "optical_flow_pyramid_lk.cl" }, + { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, + { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, + { "gemm_accumulate_biases_f16", "gemm.cl" }, + { "gemm_accumulate_biases_f32", "gemm.cl" }, + { "gemm_interleave4x4_8bit", "gemm.cl" }, + { "gemm_interleave4x4_16bit", "gemm.cl" }, + { "gemm_interleave4x4_32bit", "gemm.cl" }, + { "gemm_ma_f16", "gemm.cl" }, + { "gemm_ma_f32", "gemm.cl" }, + { "gemm_mm_u8", "gemm.cl" }, + { "gemm_mm_f16", "gemm.cl" }, + { "gemm_mm_f32_midgard", "gemm.cl" }, + { "gemm_mm_f32_bifrost", "gemm.cl" }, + { "gemm_vm_f16", "gemm.cl" }, + { "gemm_vm_f32", "gemm.cl" }, + { "gemm_lc_vm_f32", "gemm.cl" }, + { "gemm_transpose1x16_u8", "gemm.cl" }, + { "gemm_transpose1x8_f16", "gemm.cl" }, + { "gemm_transpose1x4_f32", "gemm.cl" }, + { "harris_score_3x3", "harris_corners.cl" }, + { "harris_score_5x5", "harris_corners.cl" }, + { "harris_score_7x7", "harris_corners.cl" }, + { "hist_border_kernel", "histogram.cl" }, + { "hist_border_kernel_fixed", "histogram.cl" }, + { "hist_local_kernel", "histogram.cl" }, + { "hist_local_kernel_fixed", "histogram.cl" }, + { "hog_block_normalization", "hog.cl" }, + { "hog_detector", "hog.cl" }, + { "hog_orientation_binning", "hog.cl" }, + { "hysteresis", "canny.cl" }, + { "im2col_generic", "convolution_layer.cl" }, + { "im2col_reduced", "convolution_layer.cl" }, + { "init_level", "optical_flow_pyramid_lk.cl" }, + { "init_level_max", "optical_flow_pyramid_lk.cl" }, + { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" }, + { "integral_horizontal", "integral_image.cl" }, + { "integral_vertical", "integral_image.cl" }, + { "IYUV_to_NV12_bt709", "color_convert.cl" }, + { "IYUV_to_RGB888_bt709", "color_convert.cl" }, + { 
"IYUV_to_RGBA8888_bt709", "color_convert.cl" }, + { "IYUV_to_YUV444_bt709", "color_convert.cl" }, + { "lktracker_stage0", "optical_flow_pyramid_lk.cl" }, + { "lktracker_stage1", "optical_flow_pyramid_lk.cl" }, + { "magnitude_phase", "magnitude_phase.cl" }, + { "mean_stddev_accumulate", "mean_stddev.cl" }, + { "minmax", "minmaxloc.cl" }, + { "minmax_border", "minmaxloc.cl" }, + { "minmaxloc", "minmaxloc.cl" }, + { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" }, + { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" }, + { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" }, + { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" }, + { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" }, + { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" }, + { "non_max_suppression", "nonmax.cl" }, + { "normalization_layer_cross_map", "normalization_layer.cl" }, + { "normalization_layer_in_map_1D", "normalization_layer.cl" }, + { "batchnormalization_layer", "batchnormalization_layer.cl" }, + { "NV12_to_IYUV_bt709", "color_convert.cl" }, + { "NV12_to_RGB888_bt709", "color_convert.cl" }, + { "NV12_to_RGBA8888_bt709", "color_convert.cl" }, + { "NV12_to_YUV444_bt709", "color_convert.cl" }, + { "NV21_to_IYUV_bt709", "color_convert.cl" }, + { "NV21_to_RGB888_bt709", "color_convert.cl" }, + { "NV21_to_RGBA8888_bt709", "color_convert.cl" }, + { "NV21_to_YUV444_bt709", "color_convert.cl" }, + { "pixelwise_mul_float", "pixelwise_mul_float.cl" }, + { "pixelwise_mul_int", "pixelwise_mul_int.cl" }, + { "pooling_layer_2", "pooling_layer.cl" }, + { "pooling_layer_3", "pooling_layer.cl" }, + { "remap_nearest_neighbour", "remap.cl" }, + { "remap_bilinear", "remap.cl" }, + { "reshape_to_columns", "convolution_layer.cl" }, + { "RGB888_to_IYUV_bt709", "color_convert.cl" }, + { "RGB888_to_NV12_bt709", "color_convert.cl" }, + { "RGB888_to_RGBA8888_bt709", "color_convert.cl" }, + { "RGB888_to_YUV444_bt709", "color_convert.cl" }, + { "RGBA8888_to_IYUV_bt709", 
"color_convert.cl" }, + { "RGBA8888_to_NV12_bt709", "color_convert.cl" }, + { "RGBA8888_to_RGB888_bt709", "color_convert.cl" }, + { "RGBA8888_to_YUV444_bt709", "color_convert.cl" }, + { "scale_nearest_neighbour", "scale.cl" }, + { "scale_bilinear", "scale.cl" }, + { "scharr3x3", "scharr_filter.cl" }, + { "sobel3x3", "sobel_filter.cl" }, + { "sobel_separable5x1", "sobel_filter.cl" }, + { "sobel_separable1x5", "sobel_filter.cl" }, + { "sobel_separable7x1", "sobel_filter.cl" }, + { "sobel_separable1x7", "sobel_filter.cl" }, + { "softmax_layer_max", "softmax_layer.cl" }, + { "softmax_layer_shift_exp_sum", "softmax_layer.cl" }, + { "softmax_layer_norm", "softmax_layer.cl" }, + { "suppress_non_maximum", "canny.cl" }, + { "tablelookup_U8", "tablelookup.cl" }, + { "tablelookup_S16", "tablelookup.cl" }, + { "threshold_binary", "threshold.cl" }, + { "threshold_range", "threshold.cl" }, + { "transpose", "transpose.cl" }, + { "UYVY422_to_IYUV_bt709", "color_convert.cl" }, + { "UYVY422_to_NV12_bt709", "color_convert.cl" }, + { "UYVY422_to_RGB888_bt709", "color_convert.cl" }, + { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" }, + { "warp_affine_nearest_neighbour", "warp_affine.cl" }, + { "warp_affine_bilinear", "warp_affine.cl" }, + { "warp_perspective_nearest_neighbour", "warp_perspective.cl" }, + { "warp_perspective_bilinear", "warp_perspective.cl" }, + { "YUYV422_to_IYUV_bt709", "color_convert.cl" }, + { "YUYV422_to_NV12_bt709", "color_convert.cl" }, + { "YUYV422_to_RGB888_bt709", "color_convert.cl" }, + { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" }, +}; + +const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = +{ +#ifdef EMBEDDED_KERNELS + { + "absdiff.cl", +#include "./cl_kernels/absdiff.clembed" + }, + { + "accumulate.cl", +#include "./cl_kernels/accumulate.clembed" + }, + { + "activation_layer.cl", +#include "./cl_kernels/activation_layer.clembed" + }, + { + "arithmetic_op.cl", +#include "./cl_kernels/arithmetic_op.clembed" + }, + { + 
"bitwise_op.cl", +#include "./cl_kernels/bitwise_op.clembed" + }, + { + "canny.cl", +#include "./cl_kernels/canny.clembed" + }, + { + "channel_combine.cl", +#include "./cl_kernels/channel_combine.clembed" + }, + { + "channel_extract.cl", +#include "./cl_kernels/channel_extract.clembed" + }, + { + "concatenate.cl", +#include "./cl_kernels/concatenate.clembed" + }, + { + "color_convert.cl", +#include "./cl_kernels/color_convert.clembed" + }, + { + "convolution3x3.cl", +#include "./cl_kernels/convolution3x3.clembed" + }, + { + "convolution5x5.cl", +#include "./cl_kernels/convolution5x5.clembed" + }, + { + "convolution7x7.cl", +#include "./cl_kernels/convolution7x7.clembed" + }, + { + "convolution9x9.cl", +#include "./cl_kernels/convolution9x9.clembed" + }, + { + "convolution_layer.cl", +#include "./cl_kernels/convolution_layer.clembed" + }, + { + "convolution_rectangle.cl", +#include "./cl_kernels/convolution_rectangle.clembed" + }, + { + "depth_convert.cl", +#include "./cl_kernels/depth_convert.clembed" + }, + { + "derivative.cl", +#include "./cl_kernels/derivative.clembed" + }, + { + "dilate.cl", +#include "./cl_kernels/dilate.clembed" + }, + { + "erode.cl", +#include "./cl_kernels/erode.clembed" + }, + { + "fast_corners.cl", +#include "./cl_kernels/fast_corners.clembed" + }, + { + "fill_border.cl", +#include "./cl_kernels/fill_border.clembed" + }, + { + "gaussian_pyramid.cl", +#include "./cl_kernels/gaussian_pyramid.clembed" + }, + { + "gemm.cl", +#include "./cl_kernels/gemm.clembed" + }, + { + "harris_corners.cl", +#include "./cl_kernels/harris_corners.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "histogram.cl", +#include "./cl_kernels/histogram.clembed" + }, + { + "hog.cl", +#include "./cl_kernels/hog.clembed" + }, + { + "integral_image.cl", +#include "./cl_kernels/integral_image.clembed" + }, + { + "magnitude_phase.cl", +#include "./cl_kernels/magnitude_phase.clembed" + }, + { + "mean_stddev.cl", +#include 
"./cl_kernels/mean_stddev.clembed" + }, + { + "minmaxloc.cl", +#include "./cl_kernels/minmaxloc.clembed" + }, + { + "non_linear_filter3x3.cl", +#include "./cl_kernels/non_linear_filter3x3.clembed" + }, + { + "non_linear_filter5x5.cl", +#include "./cl_kernels/non_linear_filter5x5.clembed" + }, + { + "non_linear_filter_helpers.h", +#include "./cl_kernels/non_linear_filter_helpers.hembed" + }, + { + "nonmax.cl", +#include "./cl_kernels/nonmax.clembed" + }, + { + "normalization_layer.cl", +#include "./cl_kernels/normalization_layer.clembed" + }, + { + "batchnormalization_layer.cl", +#include "./cl_kernels/batchnormalization_layer.clembed" + }, + { + "optical_flow_pyramid_lk.cl", +#include "./cl_kernels/optical_flow_pyramid_lk.clembed" + }, + { + "pixelwise_mul_float.cl", +#include "./cl_kernels/pixelwise_mul_float.clembed" + }, + { + "pixelwise_mul_int.cl", +#include "./cl_kernels/pixelwise_mul_int.clembed" + }, + { + "pooling_layer.cl", +#include "./cl_kernels/pooling_layer.clembed" + }, + { + "remap.cl", +#include "./cl_kernels/remap.clembed" + }, + { + "scale.cl", +#include "./cl_kernels/scale.clembed" + }, + { + "scharr_filter.cl", +#include "./cl_kernels/scharr_filter.clembed" + }, + { + "sobel_filter.cl", +#include "./cl_kernels/sobel_filter.clembed" + }, + { + "softmax_layer.cl", +#include "./cl_kernels/softmax_layer.clembed" + }, + { + "tablelookup.cl", +#include "./cl_kernels/tablelookup.clembed" + }, + { + "threshold.cl", +#include "./cl_kernels/threshold.clembed" + }, + { + "transpose.cl", +#include "./cl_kernels/transpose.clembed" + }, + { + "types.h", +#include "./cl_kernels/types.hembed" + }, + { + "warp_affine.cl", +#include "./cl_kernels/warp_affine.clembed" + }, + { + "warp_helpers.h", +#include "./cl_kernels/warp_helpers.hembed" + }, + { + "warp_perspective.cl", +#include "./cl_kernels/warp_perspective.clembed" + } +#endif +}; + +CLKernelLibrary::CLKernelLibrary() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ 
+} + +CLKernelLibrary &CLKernelLibrary::get() +{ + static CLKernelLibrary _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if(_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + + // Check if the program has been built before with same build options. + const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set); + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if(_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +const Program &CLKernelLibrary::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if(program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if(_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else + // Check for binary + std::string 
source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if(std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if(std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibrary::stringify_set(const StringSet &s) const +{ + std::string concat_set = "-cl-arm-non-uniform-work-group-size "; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for(const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} diff --git a/src/core/CL/ICLDistribution1D.cpp b/src/core/CL/ICLDistribution1D.cpp new file mode 100644 index 0000000000..a645d0ed71 --- /dev/null +++ b/src/core/CL/ICLDistribution1D.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLDistribution1D.h" + +#include "arm_compute/core/Error.h" + +using namespace arm_compute; + +ICLDistribution1D::ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range) + : IDistribution1D(num_bins, offset, range), _mapping(nullptr) +{ +} + +void ICLDistribution1D::map(cl::CommandQueue &q, bool blocking) +{ + ARM_COMPUTE_ERROR_ON(_mapping != nullptr); + _mapping = do_map(q, blocking); +} + +void ICLDistribution1D::unmap(cl::CommandQueue &q) +{ + ARM_COMPUTE_ERROR_ON(_mapping == nullptr); + do_unmap(q); + _mapping = nullptr; +} + +uint32_t *ICLDistribution1D::buffer() const +{ + return _mapping; +} diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp new file mode 100644 index 0000000000..e1829971cf --- /dev/null +++ b/src/core/CL/ICLHOG.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLHOG.h" + +using namespace arm_compute; + +ICLHOG::ICLHOG() + : _mapping(nullptr) +{ +} + +void ICLHOG::map(cl::CommandQueue &q, bool blocking) +{ + _mapping = do_map(q, blocking); +} + +void ICLHOG::unmap(cl::CommandQueue &q) +{ + do_unmap(q); + _mapping = nullptr; +} + +float *ICLHOG::descriptor() const +{ + return reinterpret_cast<float *>(_mapping); +}
\ No newline at end of file diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp new file mode 100644 index 0000000000..7ac0fe3bbb --- /dev/null +++ b/src/core/CL/ICLKernel.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/ICLKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cstddef> + +using namespace arm_compute; + +void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint) +{ + if(kernel.kernel()() == nullptr) + { + return; + } + + ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start()))); + + cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(), + (window.y().end() - window.y().start()) / window.y().step(), + (window.z().end() - window.z().start()) / window.z().step()); + + cl::NDRange lws = cl::NullRange; + + if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2])) + { + lws = lws_hint; + } + + queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); +} + +ICLKernel::ICLKernel() + : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target()) +{ +} + +cl::Kernel &ICLKernel::kernel() +{ + return _kernel; +} + +template <unsigned int dimension_size> +unsigned int ICLKernel::num_arguments_per_tensor() const +{ + return 2 + 2 * dimension_size; +} + +template <unsigned int dimension_size> +void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + + const ITensorInfo *info = tensor->info(); + const Strides &strides = info->strides_in_bytes(); + + // Calculate offset to the start of the window + unsigned int offset_first_element = info->offset_first_element_in_bytes(); + + for(unsigned int n = 0; n < info->num_dimensions(); 
++n) + { + offset_first_element += window[n].start() * strides[n]; + } + + unsigned int idx_start = idx; + _kernel.setArg(idx++, tensor->cl_buffer()); + + for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + { + _kernel.setArg<cl_uint>(idx++, strides[dimension]); + _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step()); + } + + _kernel.setArg<cl_uint>(idx++, offset_first_element); + + ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx, + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>()); + ARM_COMPUTE_UNUSED(idx_start); +} + +void ICLKernel::add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) +{ + add_tensor_argument<1>(idx, tensor, window); +} + +void ICLKernel::add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) +{ + add_tensor_argument<2>(idx, tensor, window); +} + +void ICLKernel::add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) +{ + add_tensor_argument<3>(idx, tensor, window); +} + +unsigned int ICLKernel::num_arguments_per_1D_tensor() const +{ + return num_arguments_per_tensor<1>(); +} + +unsigned int ICLKernel::num_arguments_per_2D_tensor() const +{ + return num_arguments_per_tensor<2>(); +} + +unsigned int ICLKernel::num_arguments_per_3D_tensor() const +{ + return num_arguments_per_tensor<3>(); +} + +void ICLKernel::set_target(cl::Device &device) +{ + _target = get_target_from_device(device); +} + +void ICLKernel::set_target(GPUTarget target) +{ + _target = target; +} + +GPUTarget ICLKernel::get_target() const +{ + return _target; +} diff --git a/src/core/CL/ICLLut.cpp b/src/core/CL/ICLLut.cpp new file mode 100644 index 0000000000..ea9deac6dc --- /dev/null +++ b/src/core/CL/ICLLut.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLLut.h" + +using namespace arm_compute; + +ICLLut::ICLLut() + : _mapping(nullptr) +{ +} + +void ICLLut::map(cl::CommandQueue &q, bool blocking) +{ + _mapping = do_map(q, blocking); +} + +void ICLLut::unmap(cl::CommandQueue &q) +{ + do_unmap(q); + _mapping = nullptr; +} + +uint8_t *ICLLut::buffer() const +{ + return _mapping; +} diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp new file mode 100644 index 0000000000..8ece566e83 --- /dev/null +++ b/src/core/CL/ICLMultiHOG.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLMultiHOG.h" + +#include "arm_compute/core/IHOG.h" + +using namespace arm_compute; + +IHOG *ICLMultiHOG::model(size_t index) +{ + return cl_model(index); +} + +const IHOG *ICLMultiHOG::model(size_t index) const +{ + return cl_model(index); +}
\ No newline at end of file diff --git a/src/core/CL/ICLMultiImage.cpp b/src/core/CL/ICLMultiImage.cpp new file mode 100644 index 0000000000..dbf3fe3e6f --- /dev/null +++ b/src/core/CL/ICLMultiImage.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLMultiImage.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/ITensor.h" + +using namespace arm_compute; + +IImage *ICLMultiImage::plane(unsigned int index) +{ + return cl_plane(index); +} + +const IImage *ICLMultiImage::plane(unsigned int index) const +{ + return cl_plane(index); +} diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp new file mode 100644 index 0000000000..5dc3e6c8bb --- /dev/null +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp new file mode 100644 index 0000000000..7b0d011b3e --- /dev/null +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLSimple3DKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp new file mode 100644 index 0000000000..fec9d923da --- /dev/null +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLSimpleKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +ICLSimpleKernel::ICLSimpleKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +{ + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp new file mode 100644 index 0000000000..4a7952e108 --- /dev/null +++ b/src/core/CL/ICLTensor.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/ICLTensor.h" + +#include <cstring> + +using namespace arm_compute; + +ICLTensor::ICLTensor() + : _mapping(nullptr) +{ +} + +void ICLTensor::map(cl::CommandQueue &q, bool blocking) +{ + _mapping = do_map(q, blocking); +} + +void ICLTensor::unmap(cl::CommandQueue &q) +{ + do_unmap(q); + _mapping = nullptr; +} + +void ICLTensor::clear(cl::CommandQueue &q) +{ + this->map(q); + std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size()); + this->unmap(q); +} + +uint8_t *ICLTensor::buffer() const +{ + return _mapping; +} diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp new file mode 100644 index 0000000000..3b8dfd2465 --- /dev/null +++ b/src/core/CL/OpenCL.cpp @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <dlfcn.h> +#include <iostream> + +using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *); +using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *); +using clReleaseMemObject_func = cl_int (*)(cl_mem); +using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); +using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue); +using clReleaseContext_func = cl_int (*)(cl_context); +using clReleaseEvent_func = cl_int (*)(cl_event); +using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); +using clRetainProgram_func = cl_int (*)(cl_program program); +using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); +using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue); +using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); +using clRetainContext_func = cl_int (*)(cl_context context); +using clReleaseProgram_func = cl_int (*)(cl_program program); +using clFlush_func = cl_int (*)(cl_command_queue command_queue); +using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, 
size_t *); +using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *); +using clRetainKernel_func = cl_int (*)(cl_kernel kernel); +using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); +using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *); +using clReleaseKernel_func = cl_int (*)(cl_kernel kernel); +using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *); +using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); + +class CLSymbols +{ +private: + CLSymbols() + { + void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL); + if(handle == nullptr) + { + std::cerr << "Can't load libOpenCL.so: " << dlerror() << std::endl; + } + else + { + clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram")); + clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel")); + clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg")); + clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel")); + clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource")); + clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer")); + clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel")); + clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel")); + clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo")); + clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush")); + clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram")); + clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, 
"clRetainContext")); + clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary")); + clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue")); + clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer")); + clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram")); + clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo")); + clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer")); + clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer")); + clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent")); + clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext")); + clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue")); + clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject")); + clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject")); + clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo")); + clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs")); + dlclose(handle); + } + } + +public: + static CLSymbols &get() + { + static CLSymbols symbols = CLSymbols(); + return symbols; + } + + clBuildProgram_func clBuildProgram = nullptr; + clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr; + clSetKernelArg_func clSetKernelArg = nullptr; + clReleaseKernel_func clReleaseKernel = nullptr; + clCreateProgramWithSource_func clCreateProgramWithSource = nullptr; + clCreateBuffer_func clCreateBuffer = nullptr; + clRetainKernel_func clRetainKernel = nullptr; + 
clCreateKernel_func clCreateKernel = nullptr; + clGetProgramInfo_func clGetProgramInfo = nullptr; + clFlush_func clFlush = nullptr; + clReleaseProgram_func clReleaseProgram = nullptr; + clRetainContext_func clRetainContext = nullptr; + clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr; + clReleaseCommandQueue_func clReleaseCommandQueue = nullptr; + clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr; + clRetainProgram_func clRetainProgram = nullptr; + clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr; + clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr; + clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr; + clReleaseEvent_func clReleaseEvent = nullptr; + clReleaseContext_func clReleaseContext = nullptr; + clRetainCommandQueue_func clRetainCommandQueue = nullptr; + clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr; + clReleaseMemObject_func clReleaseMemObject = nullptr; + clGetDeviceInfo_func clGetDeviceInfo = nullptr; + clGetDeviceIDs_func clGetDeviceIDs = nullptr; +}; + +bool arm_compute::opencl_is_available() +{ + return CLSymbols::get().clBuildProgram != nullptr; +} + +cl_int clBuildProgram( + cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) +{ + auto func = CLSymbols::get().clBuildProgram; + if(func != nullptr) + { + return func(program, num_devices, device_list, options, pfn_notify, user_data); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clEnqueueNDRangeKernel( + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) +{ + auto func = CLSymbols::get().clEnqueueNDRangeKernel; + if(func != nullptr) + { + return func(command_queue, kernel, work_dim, 
global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clSetKernelArg( + cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void *arg_value) +{ + auto func = CLSymbols::get().clSetKernelArg; + if(func != nullptr) + { + return func(kernel, arg_index, arg_size, arg_value); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clReleaseMemObject(cl_mem memobj) +{ + auto func = CLSymbols::get().clReleaseMemObject; + if(func != nullptr) + { + return func(memobj); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clEnqueueUnmapMemObject( + cl_command_queue command_queue, + cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) +{ + auto func = CLSymbols::get().clEnqueueUnmapMemObject; + if(func != nullptr) + { + return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clRetainCommandQueue(cl_command_queue command_queue) +{ + auto func = CLSymbols::get().clRetainCommandQueue; + if(func != nullptr) + { + return func(command_queue); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clReleaseContext(cl_context context) +{ + auto func = CLSymbols::get().clReleaseContext; + if(func != nullptr) + { + return func(context); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} +cl_int clReleaseEvent(cl_event event) +{ + auto func = CLSymbols::get().clReleaseEvent; + if(func != nullptr) + { + return func(event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clEnqueueWriteBuffer( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) +{ + auto func = 
CLSymbols::get().clEnqueueWriteBuffer; + if(func != nullptr) + { + return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clEnqueueReadBuffer( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) +{ + auto func = CLSymbols::get().clEnqueueReadBuffer; + if(func != nullptr) + { + return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clGetProgramBuildInfo( + cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + auto func = CLSymbols::get().clGetProgramBuildInfo; + if(func != nullptr) + { + return func(program, device, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clRetainProgram(cl_program program) +{ + auto func = CLSymbols::get().clRetainProgram; + if(func != nullptr) + { + return func(program); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +void *clEnqueueMapBuffer( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event, + cl_int *errcode_ret) +{ + auto func = CLSymbols::get().clEnqueueMapBuffer; + if(func != nullptr) + { + return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + +cl_int 
clReleaseCommandQueue(cl_command_queue command_queue) +{ + auto func = CLSymbols::get().clReleaseCommandQueue; + if(func != nullptr) + { + return func(command_queue); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_program clCreateProgramWithBinary( + cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char **binaries, + cl_int *binary_status, + cl_int *errcode_ret) +{ + auto func = CLSymbols::get().clCreateProgramWithBinary; + if(func != nullptr) + { + return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + +cl_int clRetainContext(cl_context context) +{ + auto func = CLSymbols::get().clRetainContext; + if(func != nullptr) + { + return func(context); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clReleaseProgram(cl_program program) +{ + auto func = CLSymbols::get().clReleaseProgram; + if(func != nullptr) + { + return func(program); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clFlush(cl_command_queue command_queue) +{ + auto func = CLSymbols::get().clFlush; + if(func != nullptr) + { + return func(command_queue); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clGetProgramInfo( + cl_program program, + cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + auto func = CLSymbols::get().clGetProgramInfo; + if(func != nullptr) + { + return func(program, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_kernel clCreateKernel( + cl_program program, + const char *kernel_name, + cl_int *errcode_ret) +{ + auto func = CLSymbols::get().clCreateKernel; + if(func != nullptr) + { + return func(program, kernel_name, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + 
{ + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + +cl_int clRetainKernel(cl_kernel kernel) +{ + auto func = CLSymbols::get().clRetainKernel; + if(func != nullptr) + { + return func(kernel); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_mem clCreateBuffer( + cl_context context, + cl_mem_flags flags, + size_t size, + void *host_ptr, + cl_int *errcode_ret) +{ + auto func = CLSymbols::get().clCreateBuffer; + if(func != nullptr) + { + return func(context, flags, size, host_ptr, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + +cl_program clCreateProgramWithSource( + cl_context context, + cl_uint count, + const char **strings, + const size_t *lengths, + cl_int *errcode_ret) +{ + auto func = CLSymbols::get().clCreateProgramWithSource; + if(func != nullptr) + { + return func(context, count, strings, lengths, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + +cl_int clReleaseKernel(cl_kernel kernel) +{ + auto func = CLSymbols::get().clReleaseKernel; + if(func != nullptr) + { + return func(kernel); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id *devices, + cl_uint *num_devices) +{ + auto func = CLSymbols::get().clGetDeviceIDs; + if(func != nullptr) + { + return func(platform, device_type, num_entries, devices, num_devices); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + auto func = CLSymbols::get().clGetDeviceInfo; + if(func != nullptr) + { + return func(device, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} diff --git 
a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl new file mode 100644 index 0000000000..1761342eb4 --- /dev/null +++ b/src/core/CL/cl_kernels/absdiff.cl @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Calculate the absolute difference of two input images. + * + * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * + * @param[in] in1_ptr Pointer to the first source image. 
Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void absdiff( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl new file mode 100644 index 0000000000..39c1512c3c --- /dev/null +++ b/src/core/CL/cl_kernels/accumulate.cl @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function accumulates an input image into output image. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] accu_ptr Pointer to the destination image. 
Supported data types: S16 + * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void accumulate( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(accu)) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image accu = CONVERT_TO_IMAGE_STRUCT(accu); + + // Load data + uchar16 in_data = vload16(0, input.ptr); + short16 accu_data = vload16(0, (__global short *)accu.ptr); + + // Perform accumulation + short16 res = add_sat(convert_short16(in_data), accu_data); + + // Store result + vstore16(res, 0, (__global short *)accu.ptr); +} + +/** This function accumulates a weighted value from an input image to an output image. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] accu_ptr Pointer to the destination image. 
Supported data types: S16 + * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] alpha The float scalar value with a value in the range of 0 to 1 + */ +__kernel void accumulate_weighted( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(accu), + const float alpha) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image accu = CONVERT_TO_IMAGE_STRUCT(accu); + + // Load data + const float16 in_data = convert_float16(vload16(0, input.ptr)); + const float16 accu_data = convert_float16(vload16(0, accu.ptr)); + + // Calculate weighted accumulation + const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data); + + // Store result + vstore16(res, 0, accu.ptr); +} + +/** This function accumulates a squared value from an input image to an output image. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] accu_ptr Pointer to the destination image. 
Supported data types: S16 + * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] shift The U32 scalar value with a value in the range of 0 to 15 + */ +__kernel void accumulate_squared( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(accu), + const uint shift) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image accu = CONVERT_TO_IMAGE_STRUCT(accu); + + // Load data + ushort16 in_data = convert_ushort16(vload16(0, input.ptr)); + uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr)); + + // Calculate squared accumulation + short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift)); + + // Store result + vstore16(res, 0, (__global short *)accu.ptr); +} diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl new file mode 100644 index 0000000000..e3cbb6c801 --- /dev/null +++ b/src/core/CL/cl_kernels/activation_layer.cl @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This performs an activation function on floating point inputs. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH + * @note Distinction between floating point and integer is done using the -DTYPE_FP and -DTYPE_INT preprocessor arguments + * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively. + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16, F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void activation_layer( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + // Load data + VEC_DATA_TYPE(DATA_TYPE, 16) + data = vload16(0, (__global DATA_TYPE *)input.ptr); + + // Perform activation +#if defined LOGISTIC + data = 1 / (1 + exp(-data)); +#elif defined TANH + data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 
16))B * data); +#elif defined RELU + data = max(0, data); +#elif defined BRELU + data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max(0, data)); +#elif defined SRELU + data = log(1 + exp(data)); +#elif defined ABS +#if defined TYPE_INT + data = abs(data); +#else + data = fabs(data); +#endif +#elif defined SQUARE + data = data * data; +#elif defined SQRT + data = sqrt(data); +#elif defined LINEAR + data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B; +#endif + + // Store result + vstore16(data, 0, (__global DATA_TYPE *)output.ptr); +} diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl new file mode 100644 index 0000000000..434300efa8 --- /dev/null +++ b/src/core/CL/cl_kernels/arithmetic_op.cl @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#ifdef SATURATE +#define ADD(x, y) add_sat((x), (y)) +#define SUB(x, y) sub_sat((x), (y)) +#else +#define ADD(x, y) (x) + (y) +#define SUB(x, y) (x) - (y) +#endif + +/** This function add two images. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void arithmetic_add( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + // Get pixels pointer + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + // Calculate and store result + vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); +} + +/** This function subtracts one image from another. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void arithmetic_sub( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + // Get pixels pointer + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + // Calculate and store result + vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl new file mode 100644 index 0000000000..13e6702334 --- /dev/null +++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Apply batch normalization. + * + * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32 + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. 
Supported data types: F32 + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32 + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32 + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + * @param[in] epsilon Epsilon parameter in the batch normalization equation + */ +__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var), + VECTOR_DECLARATION(beta), + VECTOR_DECLARATION(gamma), + float epsilon) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector var = CONVERT_TO_VECTOR_STRUCT(var); + Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); + Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); + + float4 _in = 0; + float4 denominator = 0; + float4 numerator = 0; + float4 x_bar = 0; + float4 gamma_vec = 0; + float4 beta_vec = 0; + + const int current_slice = get_global_id(2); + + _in = vload4(0, (__global float *)in.ptr); + denominator = *((__global float *)(var.ptr + current_slice * 
var.stride_x)); + denominator = rsqrt(denominator + epsilon); + + // Calculate x bar and store results + numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x)); + numerator = _in - numerator; + x_bar = numerator * denominator; + + gamma_vec = *((__global float *)(gamma.ptr + current_slice * beta.stride_x)); + beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x)); + + vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl new file mode 100644 index 0000000000..135bfa989c --- /dev/null +++ b/src/core/CL/cl_kernels/bitwise_op.cl @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function computes the bitwise OR of two input images. 
+ * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_or( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + uchar16 in_a = vload16(0, in1.ptr); + uchar16 in_b = vload16(0, in2.ptr); + + vstore16(in_a | in_b, 0, out.ptr); +} + +/** This function computes the bitwise AND of two input images. + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_and( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + uchar16 in_a = vload16(0, in1.ptr); + uchar16 in_b = vload16(0, in2.ptr); + + vstore16(in_a & in_b, 0, out.ptr); +} + +/** This function computes the bitwise XOR of two input images. + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8
 * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
 * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void bitwise_xor(
    IMAGE_DECLARATION(in1),
    IMAGE_DECLARATION(in2),
    IMAGE_DECLARATION(out))
{
    // Build the image structs from the kernel arguments
    Image lhs_img = CONVERT_TO_IMAGE_STRUCT(in1);
    Image rhs_img = CONVERT_TO_IMAGE_STRUCT(in2);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);

    // Each work-item combines 16 consecutive U8 elements
    const uchar16 lhs    = vload16(0, lhs_img.ptr);
    const uchar16 rhs    = vload16(0, rhs_img.ptr);
    const uchar16 result = lhs ^ rhs;

    vstore16(result, 0, dst_img.ptr);
}

/** This function computes the bitwise NOT of an image.
 *
 * @param[in] in_ptr Pointer to the source image. Supported data types: U8
 * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] out_ptr Pointer to the destination image.
Supported data types: U8
 * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
 * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void bitwise_not(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out))
{
    // Build the image structs from the kernel arguments
    Image src_img = CONVERT_TO_IMAGE_STRUCT(in);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);

    // Each work-item inverts 16 consecutive U8 elements
    const uchar16 pixels   = vload16(0, src_img.ptr);
    const uchar16 inverted = ~pixels;

    vstore16(inverted, 0, dst_img.ptr);
}
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl new file mode 100644 index 0000000000..ec6719213c --- /dev/null +++ b/src/core/CL/cl_kernels/canny.cl @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Calculate the magnitude and phase from the horizontal and vertical Sobel results. + * + * @note The calculation of gradient uses level 1 normalisation. + * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN=short -DDATA_TYPE_OUT=ushort + * + * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32 + * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 + * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] grad_ptr Pointer to the gradient output. 
Supported data types: U16, U32
 * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
 * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
 * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
 */
__kernel void combine_gradients_L1(
    IMAGE_DECLARATION(src1),
    IMAGE_DECLARATION(src2),
    IMAGE_DECLARATION(grad),
    IMAGE_DECLARATION(angle))
{
    // Construct images
    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);

    // Load sobel horizontal (src1) and vertical (src2) values, four elements per work-item
    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
    h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
    v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);

    /* Calculate the gradient, using level 1 normalisation method: |h| + |v|.
     * abs() returns the unsigned counterpart of the input type, so a plain '+' could wrap
     * around before the saturating conversion is applied; add_sat() clamps the sum instead. */
    VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
    m = CONVERT_SAT(add_sat(abs(h), abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));

    /* Calculate the angle (atan2pi returns the angle divided by pi, i.e. in [-1, 1]) */
    float4 p = atan2pi(convert_float4(v), convert_float4(h));

    /* Remap angle to range [0, 256): negative angles are shifted by a full turn (+2) before scaling */
    p = select(p, p + 2, p < 0.0f) * 128.0f;

    /* Store results */
    vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
}

/** Calculate the gradient and angle from the horizontal and vertical Sobel results.
 *
 * @note The calculation of gradient uses level 2 normalisation
 * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
 * e.g. -DDATA_TYPE_IN=short -DDATA_TYPE_OUT=ushort
 *
 * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
 * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
 * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] grad_ptr Pointer to the gradient output.
Supported data types: U16, U32
 * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
 * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
 * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
 */
__kernel void combine_gradients_L2(
    IMAGE_DECLARATION(src1),
    IMAGE_DECLARATION(src2),
    IMAGE_DECLARATION(grad),
    IMAGE_DECLARATION(angle))
{
    // Build the image structs from the kernel arguments
    Image gx_img    = CONVERT_TO_IMAGE_STRUCT(src1);
    Image gy_img    = CONVERT_TO_IMAGE_STRUCT(src2);
    Image mag_img   = CONVERT_TO_IMAGE_STRUCT(grad);
    Image phase_img = CONVERT_TO_IMAGE_STRUCT(angle);

    // Four horizontal (src1) and four vertical (src2) Sobel values, widened to float
    const float4 gx = convert_float4(vload4(0, (__global DATA_TYPE_IN *)gx_img.ptr));
    const float4 gy = convert_float4(vload4(0, (__global DATA_TYPE_IN *)gy_img.ptr));

    // Level 2 normalisation: Euclidean length of the gradient vector
    const float4 magnitude = sqrt(gx * gx + gy * gy);

    // Angle divided by pi; negative values are shifted by a full turn and the result scaled to [0, 256)
    const float4 half_turns = atan2pi(gy, gx);
    const float4 phase      = select(half_turns, half_turns + 2, half_turns < 0.0f) * 128.0f;

    // Write the rounded, saturated magnitude and the quantised angle
    vstore4(CONVERT_SAT_ROUND(magnitude, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)mag_img.ptr);
    vstore4(convert_uchar4_sat_rte(phase), 0, phase_img.ptr);
}

/** Array that holds the relative coordinates offset for the neighbouring pixels.
 *
 * Each entry packs the offsets of two neighbours as (x0, y0, x1, y1); the trailing comment
 * on each row is the direction in degrees the entry is used for.
 */
__constant short4 neighbours_coords[] =
{
    { -1, 0, 1, 0 },   // 0
    { -1, 1, 1, -1 },  // 45
    { 0, 1, 0, -1 },   // 90
    { 1, 1, -1, -1 },  // 135
    { 1, 0, -1, 0 },   // 180
    { 1, -1, -1, 1 },  // 225
    { 0, 1, 0, -1 },   // 270
    { -1, -1, 1, 1 },  // 315
    { -1, 0, 1, 0 },   // 360
};

/** Perform non maximum suppression.
 *
 * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
 * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
 *
 * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32
 * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
 * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8
 * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
 * @param[out] non_max_ptr Pointer to the non maximum suppressed output.
Supported data types: U16, U32
 * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output
 * @param[in] lower_thr The low threshold
 */
__kernel void suppress_non_maximum(
    IMAGE_DECLARATION(grad),
    IMAGE_DECLARATION(angle),
    IMAGE_DECLARATION(non_max),
    uint lower_thr)
{
    // Build the image structs from the kernel arguments
    Image grad_img  = CONVERT_TO_IMAGE_STRUCT(grad);
    Image angle_img = CONVERT_TO_IMAGE_STRUCT(angle);
    Image dst_img   = CONVERT_TO_IMAGE_STRUCT(non_max);

    // Gradient magnitude and quantised angle of the pixel handled by this work-item
    const DATA_TYPE_IN centre = *((__global DATA_TYPE_IN *)grad_img.ptr);
    const uchar        phase  = convert_ushort(*angle_img.ptr);

    // Nothing is written for pixels that do not exceed the low threshold
    if(centre <= lower_thr)
    {
        return;
    }

    // Quantise the angle into an index of the neighbour table (8 directions, entries 0..8)
    const uchar         shifted   = 127 - phase;
    const DATA_TYPE_OUT direction = (shifted + 16) >> 5;

    // Fetch the two neighbours; the table entry is laid out as (x0, y0, x1, y1)
    const short4       nb      = neighbours_coords[direction];
    const DATA_TYPE_IN first   = *((global DATA_TYPE_IN *)offset(&grad_img, nb.s0, nb.s1));
    const DATA_TYPE_IN second  = *((global DATA_TYPE_IN *)offset(&grad_img, nb.s2, nb.s3));

    // Keep the pixel only when it is strictly greater than both neighbours
    if((centre > first) && (centre > second))
    {
        *((global DATA_TYPE_OUT *)dst_img.ptr) = centre;
    }
}

#define EDGE 255
#define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjusting this can impact the match rate with VX implementation

/** Check whether pixel is valid
*
* Skip the pixel if the early_test fails.
* Otherwise, it tries to add the pixel coordinate to the stack, and proceeds to popping the stack instead if the stack is full
*
* @note The macro expands in place and relies on names provided by the expansion site: the variable c,
*       the pointer l1_ptr, the images l1_stack_counter and recorded, and the label pop_stack.
*
* @param[in] early_test Boolean condition based on the minv check and visited buffer check; the pixel is skipped when it evaluates to true
* @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary
* @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary
* @param[in] x_cur X-coordinate of current central pixel
* @param[in] y_cur Y-coordinate of current central pixel
*/
#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur)                               \
    {                                                                                     \
        /* The argument is parenthesised so that any expression can be passed safely */   \
        if(!(early_test))                                                                 \
        {                                                                                 \
            /* Number of elements in the local stack 1, points to next available entry */ \
            c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur));              \
                                                                                          \
            if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */           \
                goto pop_stack;                                                           \
                                                                                          \
            /* The pixel that has already been recorded is ignored */                     \
            if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1))           \
            {                                                                             \
                l1_ptr[c] = (short2)(x_pos, y_pos);                                       \
                *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1;         \
            }                                                                             \
        }                                                                                 \
    }

/** Perform hysteresis.
 *
 * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
 *
 * @param[in] src_ptr Pointer to the input image. Supported data types: U8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element of the output
 * @param[out] out_ptr Pointer to the output image.
Supported data types: U8 + * @param[in] out_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output + * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32 + * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output + * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32 + * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output + * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. 
Supported data types: S32 + * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output + * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8 + * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output + * @param[in] low_thr The lower threshold + * @param[in] up_thr The upper threshold + * @param[in] width The width of the image. 
* @param[in] height The height of the image
 */
kernel void hysteresis(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(out),
    IMAGE_DECLARATION(visited),
    IMAGE_DECLARATION(recorded),
    IMAGE_DECLARATION(l1_stack),
    IMAGE_DECLARATION(l1_stack_counter),
    uint low_thr,
    uint up_thr,
    int  width,
    int  height)
{
    // Create images
    Image src              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
    Image out              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
    Image visited          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
    Image recorded         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
    Image l1_stack         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
    Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);

    // Index of the pixel this work-item starts from; x and y are reused below as the
    // coordinates of the pixel currently being processed
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Load value
    DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));

    // If less than or equal to the upper threshold set to NO_EDGE (0) and return
    if(val <= up_thr)
    {
        *offset(&out, x, y) = 0;
        return;
    }

    // Init local stack 2: private stack of the positions visited on the way to the current pixel
    short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
    int    L2_counter                          = 0;

    // Perform recursive hysteresis (implemented iteratively with the two stacks and gotos)
    while(true)
    {
        // Get L1 stack pointer: hysteresis_local_stack_L1 entries are reserved per pixel
        __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);

        // If the pixel has already been visited, proceed with the items in the stack instead
        // (atomic_or returns the previous value, so only the first visitor sees 0)
        if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
        {
            goto pop_stack;
        }

        // Set strong edge
        *offset(&out, x, y) = EDGE;

        // If stack l2 is full, we don't check the surrounding pixels and go back one level instead
        if(L2_counter > (hysteresis_local_stack_L2 - 1))
        {
            goto pop_stack2;
        }

        // Number of entries in the L1 stack of the current pixel (assigned by check_pixel and at pop_stack)
        char c;

        VEC_DATA_TYPE(DATA_TYPE_IN, 4)
        x_tmp;
        uint4 v_tmp;

        // Get direction pixel indices
        // NOTE(review): S and E are clamped to height - 2 / width - 2 rather than - 1;
        // presumably tied to the border handling on the host side - confirm
        int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);

        // Check 8 pixels around for weak edges where low_thr < val <= up_thr
        // Each vload4 starts at column W, so lanes .s0/.s1/.s2 are used as the west, centre and east columns
        // NOTE(review): when x == 0, W == x and the lanes no longer line up with x - 1, x, x + 1 - confirm
        // that border pixels are excluded by the caller
        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
        v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE

        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
        v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E

        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
        v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE

#undef check_pixel

pop_stack:
        // Take the next recorded neighbour (if any) from the L1 stack of the current pixel
        c = *((__global char *)offset(&l1_stack_counter, x, y));

        if(c >= 1)
        {
            *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
            int2 l_c = convert_int2(l1_ptr[c - 1]);

            // Push the current position into level 2 stack
            stack_L2[L2_counter].x = x;
            stack_L2[L2_counter].y = y;

            // Continue with the popped neighbour
            x = l_c.x;
            y = l_c.y;

            L2_counter++;

            continue;
        }

        // L1 stack of the current pixel is empty: go back one level, or finish if there is none
        if(L2_counter > 0)
        {
            goto pop_stack2;
        }
        else
        {
            return;
        }

pop_stack2:
        // Return to the position stored on top of the level 2 stack
        L2_counter--;
        x = stack_L2[L2_counter].x;
        y = stack_L2[L2_counter].y;
    };
}
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl new file mode 100644 index 0000000000..93e80b925e --- /dev/null +++ b/src/core/CL/cl_kernels/channel_combine.cl @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function combines three planes to a single RGB image. + * + * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 + * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) + * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) + * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane + * @param[in] plane1_ptr Pointer to the second plane. 
Supported Format: U8 + * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) + * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) + * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane + * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 + * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] dst_ptr Pointer to the destination image. 
Supported Format: RGB
 * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void channel_combine_RGB888(
    IMAGE_DECLARATION(plane0),
    IMAGE_DECLARATION(plane1),
    IMAGE_DECLARATION(plane2),
    IMAGE_DECLARATION(dst))
{
    // Build the image structs for the three input planes and the interleaved output
    Image first_plane  = CONVERT_TO_IMAGE_STRUCT(plane0);
    Image second_plane = CONVERT_TO_IMAGE_STRUCT(plane1);
    Image third_plane  = CONVERT_TO_IMAGE_STRUCT(plane2);
    Image packed       = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 samples from each plane produce 48 interleaved output bytes
    const uchar16 a = vload16(0, first_plane.ptr);
    const uchar16 b = vload16(0, second_plane.ptr);
    const uchar16 c = vload16(0, third_plane.ptr);

    // Output bytes 0..15
    const uchar16 bytes_0_15 = (uchar16)(a.s0, b.s0, c.s0, a.s1, b.s1, c.s1, a.s2, b.s2,
                                         c.s2, a.s3, b.s3, c.s3, a.s4, b.s4, c.s4, a.s5);
    // Output bytes 16..31
    const uchar16 bytes_16_31 = (uchar16)(b.s5, c.s5, a.s6, b.s6, c.s6, a.s7, b.s7, c.s7,
                                          a.s8, b.s8, c.s8, a.s9, b.s9, c.s9, a.sA, b.sA);
    // Output bytes 32..47
    const uchar16 bytes_32_47 = (uchar16)(c.sA, a.sB, b.sB, c.sB, a.sC, b.sC, c.sC, a.sD,
                                          b.sD, c.sD, a.sE, b.sE, c.sE, a.sF, b.sF, c.sF);

    // The vstore16 offset is expressed in units of 16 elements
    vstore16(bytes_0_15, 0, packed.ptr);
    vstore16(bytes_16_31, 1, packed.ptr);
    vstore16(bytes_32_47, 2, packed.ptr);
}

/** This function combines four planes to a single RGBA image.
 *
 * @param[in] plane0_ptr Pointer to the first plane.
Supported Format: U8 + * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) + * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) + * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane + * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 + * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) + * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) + * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane + * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 + * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] plane3_ptr Pointer to the fourth plane. 
Supported Format: U8
 * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes)
 * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes)
 * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
 * @param[out] dst_ptr Pointer to the destination image. Supported Format: RGBA
 * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void channel_combine_RGBA8888(
    IMAGE_DECLARATION(plane0),
    IMAGE_DECLARATION(plane1),
    IMAGE_DECLARATION(plane2),
    IMAGE_DECLARATION(plane3),
    IMAGE_DECLARATION(dst))
{
    // Build the image structs for the four input planes and the interleaved output
    Image first_plane  = CONVERT_TO_IMAGE_STRUCT(plane0);
    Image second_plane = CONVERT_TO_IMAGE_STRUCT(plane1);
    Image third_plane  = CONVERT_TO_IMAGE_STRUCT(plane2);
    Image fourth_plane = CONVERT_TO_IMAGE_STRUCT(plane3);
    Image packed       = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 samples from each plane produce 64 interleaved output bytes
    const uchar16 a = vload16(0, first_plane.ptr);
    const uchar16 b = vload16(0, second_plane.ptr);
    const uchar16 c = vload16(0, third_plane.ptr);
    const uchar16 d = vload16(0, fourth_plane.ptr);

    // Samples 0..3
    const uchar16 quad0 = (uchar16)(a.s0, b.s0, c.s0, d.s0, a.s1, b.s1, c.s1, d.s1,
                                    a.s2, b.s2, c.s2, d.s2, a.s3, b.s3, c.s3, d.s3);
    // Samples 4..7
    const uchar16 quad1 = (uchar16)(a.s4, b.s4, c.s4, d.s4, a.s5, b.s5, c.s5, d.s5,
                                    a.s6, b.s6, c.s6, d.s6, a.s7, b.s7, c.s7, d.s7);
    // Samples 8..11
    const uchar16 quad2 = (uchar16)(a.s8, b.s8, c.s8, d.s8, a.s9, b.s9, c.s9, d.s9,
                                    a.sA, b.sA, c.sA, d.sA, a.sB, b.sB, c.sB, d.sB);
    // Samples 12..15
    const uchar16 quad3 = (uchar16)(a.sC, b.sC, c.sC, d.sC, a.sD, b.sD, c.sD, d.sD,
                                    a.sE, b.sE, c.sE, d.sE, a.sF, b.sF, c.sF, d.sF);

    // The vstore16 offset is expressed in units of 16 elements
    vstore16(quad0, 0, packed.ptr);
    vstore16(quad1, 1, packed.ptr);
    vstore16(quad2, 2, packed.ptr);
    vstore16(quad3, 3, packed.ptr);
}

/** This function combines three planes to a single YUYV image.
 *
 * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
 * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
 * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
 * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
 * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
 * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
 * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
 * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
 * @param[in] plane2_ptr Pointer to the third plane.
Supported Format: U8 + * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] dst_ptr Pointer to the destination image. Supported Format: YUYV + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_combine_YUYV422( + IMAGE_DECLARATION(plane0), + IMAGE_DECLARATION(plane1), + IMAGE_DECLARATION(plane2), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); + Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); + Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data0 = vload16(0, plane0.ptr); + uchar8 data1 = vload8(0, plane1.ptr); + uchar8 data2 = vload8(0, plane2.ptr); + + uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0, + data0.s2, data1.s1, data0.s3, data2.s1, + data0.s4, data1.s2, data0.s5, data2.s2, + data0.s6, data1.s3, data0.s7, data2.s3); + vstore16(out0, 0, dst.ptr); + uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4, + data0.sA, data1.s5, data0.sB, data2.s5, + data0.sC, data1.s6, data0.sD, data2.s6, + data0.sE, data1.s7, data0.sF, data2.s7); + 
vstore16(out1, 0, dst.ptr + 16); +} + +/** This function combines three planes to a single UYVY image. + * + * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 + * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) + * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) + * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane + * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 + * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) + * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) + * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane + * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 + * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] dst_ptr Pointer to the destination image.
Supported Format: UYVY + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_combine_UYVY422( + IMAGE_DECLARATION(plane0), + IMAGE_DECLARATION(plane1), + IMAGE_DECLARATION(plane2), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); + Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); + Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data0 = vload16(0, plane0.ptr); + uchar8 data1 = vload8(0, plane1.ptr); + uchar8 data2 = vload8(0, plane2.ptr); + + uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1, + data1.s1, data0.s2, data2.s1, data0.s3, + data1.s2, data0.s4, data2.s2, data0.s5, + data1.s3, data0.s6, data2.s3, data0.s7); + vstore16(out0, 0, dst.ptr); + uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9, + data1.s5, data0.sA, data2.s5, data0.sB, + data1.s6, data0.sC, data2.s6, data0.sD, + data1.s7, data0.sE, data2.s7, data0.sF); + vstore16(out1, 0, dst.ptr + 16); +} + +/** This function combines three planes to a single NV12/NV21 image. + * + * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine. + * + * @param[in] src_plane0_ptr Pointer to the first plane.
Supported Format: U8 + * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) + * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) + * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane + * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 + * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) + * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) + * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane + * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8 + * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. 
Supported Format: U8 + * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) + * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) + * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image + * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88 + * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) + * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) + * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image + * @param[in] height Sub-sampled height + */ +__kernel void channel_combine_NV( + IMAGE_DECLARATION(src_plane0), + IMAGE_DECLARATION(src_plane1), + IMAGE_DECLARATION(src_plane2), + IMAGE_DECLARATION(dst_plane0), + IMAGE_DECLARATION(dst_plane1), + uint height) +{ + // Get pixels pointer + Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); + Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); + Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); + Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); + Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); + + // Copy plane data + vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); + vstore16(vload16(0, 
offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); + + // Create UV plane + uchar8 data1 = vload8(0, src_plane1.ptr); + uchar8 data2 = vload8(0, src_plane2.ptr); + + // shuffle2 is a gather: mask element i selects the source element for result element i + // (0-7 index the first vector, 8-15 the second), so this mask interleaves the two planes +#if defined NV12 + vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); +#elif defined NV21 + vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); +#endif +} + +/** This function combines three planes to a single YUV444 or IYUV image. + * + * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine. + * + * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8 + * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) + * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) + * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane + * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 + * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) + * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) + * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane + * @param[in] src_plane2_ptr Pointer to the third plane.
Supported Format: U8 + * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) + * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) + * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane + * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8 + * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) + * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) + * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image + * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8 + * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) + * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) + * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image + * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. 
Supported Format: U8 + * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes) + * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes) + * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image + * @param[in] height Sub-sampled height + */ +__kernel void copy_planes_3p( + IMAGE_DECLARATION(src_plane0), + IMAGE_DECLARATION(src_plane1), + IMAGE_DECLARATION(src_plane2), + IMAGE_DECLARATION(dst_plane0), + IMAGE_DECLARATION(dst_plane1), + IMAGE_DECLARATION(dst_plane2), + uint height) +{ + // Get pixels pointer + Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); + Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); + Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); + Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); + Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); + Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2); + + // Copy plane data + vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); +#if defined YUV444 + vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr); + vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr); +#elif defined IYUV + vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); + vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr); + vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr); +#endif +} diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl new file mode 100644 index 0000000000..14c6c8a92a --- /dev/null +++ b/src/core/CL/cl_kernels/channel_extract.cl @@ -0,0 +1,272 @@ +/* + * Copyright (c) 
2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function extracts a given channel from an RGB image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: RGB + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_RGB888( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + uchar8 data2 = vload8(0, src.ptr + 16); + +#if defined CHANNEL_R + vstore4(data.s0369, 0, dst.ptr); + vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4); +#elif defined CHANNEL_G + vstore4(data.s147A, 0, dst.ptr); + vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4); +#elif defined CHANNEL_B + vstore4(data.s258B, 0, dst.ptr); + vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4); +#endif +} + +/** This function extracts a given channel from an RGBA image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_RGBA8888( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + uchar16 data2 = vload16(0, src.ptr + 16); + +#if defined CHANNEL_R + vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr); +#elif defined CHANNEL_G + vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr); +#elif defined CHANNEL_B + vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr); +#elif defined CHANNEL_A + vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr); +#endif +} + +/** This function extracts a given channel from an YUYV image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_YUYV422( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + +#if defined CHANNEL_Y + vstore8(data.s02468ACE, 0, dst.ptr); +#elif defined CHANNEL_U + vstore4(data.s159D, 0, dst.ptr); +#elif defined CHANNEL_V + vstore4(data.s37BF, 0, dst.ptr); +#endif +} + +/** This function extracts a given channel from a UYVY image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: UYVY + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_UYVY422( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + +#if defined CHANNEL_Y + vstore8(data.s13579BDF, 0, dst.ptr); +#elif defined CHANNEL_U + vstore4(data.s048C, 0, dst.ptr); +#elif defined CHANNEL_V + vstore4(data.s26AE, 0, dst.ptr); +#endif +} + +/** This function extracts a given channel from an NV12 image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. + * @warning Only channels UV can be extracted using this kernel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88) + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_NV12( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + +#if defined CHANNEL_U + vstore8(data.s02468ACE, 0, dst.ptr); +#elif defined CHANNEL_V + vstore8(data.s13579BDF, 0, dst.ptr); +#endif +} + +/** This function extracts a given channel from an NV21 image. + * + * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. + * @warning Only channels UV can be extracted using this kernel. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88) + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void channel_extract_NV21( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uchar16 data = vload16(0, src.ptr); + +#if defined CHANNEL_U + vstore8(data.s13579BDF, 0, dst.ptr); +#elif defined CHANNEL_V + vstore8(data.s02468ACE, 0, dst.ptr); +#endif +} + +/** This function extracts a given plane from a multi-planar image. + * + * @param[in] src_ptr Pointer to the source image. Supported Format: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported Format: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void copy_plane( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Copy plane data + vstore16(vload16(0, src.ptr), 0, dst.ptr); +} diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl new file mode 100644 index 0000000000..f5ec85ae76 --- /dev/null +++ b/src/core/CL/cl_kernels/color_convert.cl @@ -0,0 +1,1823 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Convert an RGB888 image to RGBX8888 + * + * Global Workgroup Size [ DIV_CEIL(width, 16), height ] + * No offset. + * + * @param[in] input_ptr Pointer to the source image. Supported Format: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 16 pixels every time: 3 * 16 = 48 input bytes, 4 * 16 = 64 output bytes
+    uchar16 rgb_0 = vload16(0, in.ptr);
+    uchar16 rgb_1 = vload16(0, in.ptr + 16);
+    uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+    // Re-pack every RGB triplet followed by an alpha byte of 255.
+    // Pixels 5 and 10 straddle two loads (rgb_0.sf + rgb_1.s01, rgb_1.sef + rgb_2.s0).
+    // Numeric component selectors are always written with the 's' prefix (.sf, not .f),
+    // which is the only form defined by the OpenCL C specification.
+    uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
+    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+    uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
+    uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+    vstore16(rgba_2, 0, out.ptr + 32);
+    vstore16(rgba_3, 0, out.ptr + 48);
+}
+
+/** Convert an RGBA8888 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image.
Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGBA8888_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image src_img = CONVERT_TO_IMAGE_STRUCT(input);
+    Image dst_img = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // One work-item converts 16 pixels: four 16-byte RGBA loads (64 bytes in total).
+    // vload16(k, p) reads the 16 bytes starting at p + 16 * k.
+    uchar16 quad_0 = vload16(0, src_img.ptr);
+    uchar16 quad_1 = vload16(1, src_img.ptr);
+    uchar16 quad_2 = vload16(2, src_img.ptr);
+    uchar16 quad_3 = vload16(3, src_img.ptr);
+
+    // Drop every fourth byte (components 3, 7, b, f of each load) and pack what is left
+    // into three 16-byte vectors (48 bytes in total).
+    uchar16 packed_0 = (uchar16)(quad_0.s01245689, quad_0.sacde, quad_1.s0124);
+    uchar16 packed_1 = (uchar16)(quad_1.s5689acde, quad_2.s01245689);
+    uchar16 packed_2 = (uchar16)(quad_2.sacde, quad_3.s01245689, quad_3.sacde);
+
+    // vstore16(v, k, p) writes the 16 bytes starting at p + 16 * k.
+    vstore16(packed_0, 0, dst_img.ptr);
+    vstore16(packed_1, 1, dst_img.ptr);
+    vstore16(packed_2, 2, dst_img.ptr);
+}
+
+/** Convert a UYVY422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    // Odd bytes are luma; every Cb (bytes 0, 4, 8, c) and Cr (bytes 2, 6, a, e) sample is
+    // shared by two neighbouring pixels. Subtracting 128 re-centres the chroma around zero.
+    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+    float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+    float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+    float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+    // The sums above can leave [0, 255] (e.g. luma 255 with cr 127 gives 455 for red), so the
+    // conversion must saturate; without _sat the result of an out-of-range conversion is not defined.
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    // Interleave into 8 RGB triplets = 24 bytes, written as a 16-byte and an 8-byte store
+    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+    vstore16(rgb_0, 0, out.ptr);
+    vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a UYVY422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image.
Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    // Odd bytes are luma; every Cb (bytes 0, 4, 8, c) and Cr (bytes 2, 6, a, e) sample is
+    // shared by two neighbouring pixels. Subtracting 128 re-centres the chroma around zero.
+    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+    float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+    float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+    float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+    // The sums above can leave [0, 255] (e.g. luma 255 with cr 127 gives 455 for red), so the
+    // conversion must saturate; without _sat the result of an out-of-range conversion is not defined.
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    // Interleave into 8 RGBA quadruplets (alpha fixed at 255) = 32 bytes, two 16-byte stores
+    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to
RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 yuyv = vload16(0, in.ptr);
+
+    // Even bytes are luma; every Cb (bytes 1, 5, 9, d) and Cr (bytes 3, 7, b, f) sample is
+    // shared by two neighbouring pixels. Subtracting 128 re-centres the chroma around zero.
+    uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    char8  cb   = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
+    char8  cr   = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
+
+    float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+    float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+    float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+    // The sums above can leave [0, 255] (e.g. luma 255 with cr 127 gives 455 for red), so the
+    // conversion must saturate; without _sat the result of an out-of-range conversion is not defined.
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    // Interleave into 8 RGB triplets = 24 bytes, written as a 16-byte and an 8-byte store
+    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+    vstore16(rgb_0, 0, out.ptr);
+    vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image.
Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 yuyv = vload16(0, in.ptr);
+
+    // Even bytes are luma; every Cb (bytes 1, 5, 9, d) and Cr (bytes 3, 7, b, f) sample is
+    // shared by two neighbouring pixels. Subtracting 128 re-centres the chroma around zero.
+    uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    char8  cb   = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
+    char8  cr   = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
+
+    float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+    float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+    float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+    // The sums above can leave [0, 255] (e.g. luma 255 with cr 127 gives 455 for red), so the
+    // conversion must saturate; without _sat the result of an out-of-range conversion is not defined.
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    // Interleave into 8 RGBA quadruplets (alpha fixed at 255) = 32 bytes, two 16-byte stores
+    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a RGB image to NV12
using BT709 color space + * + * Global Workgroup Size [ DIV_CEIL(width, 4), height ] + * No offset. + * + * @param[in] input_ptr Pointer to the source image. Supported Format: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel + * @param[out] uv_ptr Pointer to the destination uv channel. 
Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGB888_to_NV12_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // handle 4 pixels every time, two lines, each line for 2 pixels (a 2x2 block yields 4 luma bytes and one averaged Cb/Cr pair)
+    // Read 2 pixels of the first line (vload8 fetches 8 bytes; only the first 6, i.e. two RGB triplets, are used)
+    uchar8 rgb_0 = vload8(0, in.ptr);
+    uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+    uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+    uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixels of the second line, input_stride_y bytes after the first; f_y/f_u/f_v and i_y/i_u/i_v are reused from here on
+    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+    uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+    uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+    uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+
+    vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/*
+ YUV to RGB coefficients applied by the NV12_to_RGB888_bt709 kernel below (U and V taken relative to 128):
+ R'= Y' + 0.0000*U + 1.5748*V
+ G'= Y' - 0.1873*U - 0.4681*V
+ B'= Y' + 1.8556*U + 0.0000*V
+*/
+
+/** Convert an NV12 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel.
Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] rgb_output_ptr Pointer to the destination image. 
Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_luma.ptr);
+    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+    // Two interleaved Cb/Cr pairs; each pair is shared by two pixels of both lines
+    uchar4 cbcr = vload4(0, in_uv.ptr);
+    char4  cb   = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4  cr   = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    // Chroma contributions to R, G and B; computed once and reused for both lines
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    // First line
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    // luma + chroma can leave [0, 255] (e.g. luma 255 with cr 127 gives 455 for red), so the
+    // conversion must saturate; without _sat the result of an out-of-range conversion is not defined.
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    // 4 RGB triplets = 12 bytes, written as an 8-byte and a 4-byte store
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+    // Second line: same chroma terms, luma of the row below
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert a RGB image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel.
Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_YUV444_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // Each work-item converts 4 pixels
+    Image src_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image dst_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image dst_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image dst_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // One 16-byte load; the first 12 bytes are the 4 RGB triplets, split per channel
+    uchar16 pixels = vload16(0, src_rgb.ptr);
+    uchar4  red    = (uchar4)(pixels.s0, pixels.s3, pixels.s6, pixels.s9);
+    uchar4  green  = (uchar4)(pixels.s1, pixels.s4, pixels.s7, pixels.sa);
+    uchar4  blue   = (uchar4)(pixels.s2, pixels.s5, pixels.s8, pixels.sb);
+
+    // Weighted channel sums for Y, U and V
+    float4 y_f = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(red) + (float4)(0.7152f) * convert_float4(green) + (float4)(0.0722f) * convert_float4(blue);
+    float4 u_f = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(red) - (float4)(0.3854f) * convert_float4(green) + (float4)(0.5000f) * convert_float4(blue);
+    float4 v_f = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(red) - (float4)(0.4542f) * convert_float4(green) - (float4)(0.0458f) * convert_float4(blue);
+
+    // Truncate towards zero, then shift the chroma by 128
+    short4 y_s = convert_short4_rtz(y_f);
+    short4 u_s = convert_short4_rtz(u_f) + (short4)(128);
+    short4 v_s = convert_short4_rtz(v_f) + (short4)(128);
+
+    // Limit every channel to [0, 255] before narrowing to 8 bits
+    uchar4 y_out = convert_uchar4(max((short4)(0), min(y_s, (short4)(255))));
+    uchar4 u_out = convert_uchar4(max((short4)(0), min(u_s, (short4)(255))));
+    uchar4 v_out = convert_uchar4(max((short4)(0), min(v_s, (short4)(255))));
+
+    // One 4-byte store per plane
+    vstore4(y_out, 0, dst_y.ptr);
+    vstore4(u_out, 0, dst_u.ptr);
+    vstore4(v_out, 0, dst_v.ptr);
+}
+
+/** Convert a RGB image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel.
Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 + * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) + * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) + * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel + * @param[out] v_output_ptr Pointer to the destination V channel. 
Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_IYUV_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time, two lines, each line for 2 pixels (a 2x2 block yields 4 luma bytes, one averaged U byte and one averaged V byte)
+    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 2 pixels of the first line (vload8 fetches 8 bytes; only the first 6, i.e. two RGB triplets, are used)
+    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+    uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+    uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+    uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixels of the second line, rgb_input_stride_y bytes after the first; f_y/f_u/f_v and i_y/i_u/i_v are reused from here on
+    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
+    uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+    uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+    uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+    *out_u.ptr = cbcr.x;
+    *out_v.ptr = cbcr.y;
+}
+
+/** Convert a RGBA image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image.
Supported Format: U8 + * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 + * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) + * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) + * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel + * @param[out] v_output_ptr Pointer to the destination V channel. 
Supported Format: U8 + * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) + * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes) + * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel + * + */ +__kernel void RGBA8888_to_YUV444_bt709( + IMAGE_DECLARATION(rgba_input), + IMAGE_DECLARATION(luma_output), + IMAGE_DECLARATION(u_output), + IMAGE_DECLARATION(v_output)) +{ + // handle 4 pixels every time + Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input); + Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); + Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); + Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); + + // Read 4 pixel + uchar16 rgb_0 = vload16(0, in_rgba.ptr); + uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc); + uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd); + uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se); + + float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0); + float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0); + float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0); + + short4 i_y = convert_short4(f_y); + short4 i_u = convert_short4(f_u) + (short4)(128); + short4 i_v = convert_short4(f_v) + (short4)(128); + + uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255)))); + vstore4(luma_0, 0, out_y.ptr); + + uchar4 cb_0 = 
convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255)))); + uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255)))); + vstore4(cb_0, 0, out_u.ptr); + vstore4(cr_0, 0, out_v.ptr); +} + +/** Convert a RGBA image to NV12 using BT709 color space + * + * Global Workgroup Size [ DIV_CEIL(width, 2), height ] + * No offset. + * + * @param[in] input_ptr Pointer to the source image. Supported Format: U8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel + * @param[out] uv_output_ptr Pointer to the destination uv channel. 
Supported Format: U8 + * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes) + * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes) + * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel + * + */ +__kernel void RGBA8888_to_NV12_bt709( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(luma_output), + IMAGE_DECLARATION(uv_output)) +{ + Image in = CONVERT_TO_IMAGE_STRUCT(input); + Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); + Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output); + + // Read 2 pixel of the first line + uchar8 rgb_0 = vload8(0, in.ptr); + uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4); + uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5); + uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6); + + float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); + float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); + float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); + + short2 i_y = convert_short2_rtz(f_y); + short2 i_u = convert_short2_rtz(f_u) + (short2)(128); + short2 i_v = convert_short2_rtz(f_v) + (short2)(128); + + uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); + vstore2(luma_0, 0, out_y.ptr); + + uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); + uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); + + // Read 2 pixel of 
the second line + uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y); + uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4); + uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5); + uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6); + + f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); + f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); + f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); + + i_y = convert_short2_rtz(f_y); + i_u = convert_short2_rtz(f_u) + (short2)(128); + i_v = convert_short2_rtz(f_v) + (short2)(128); + + uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); + vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); + + uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); + uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); + uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), + ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); + vstore2(cbcr, 0, out_uv.ptr); +} + +/** Convert a RGBA image to IYUV using BT709 color space + * + * Global Workgroup Size [ DIV_CEIL(width, 2), height ] + * No offset. + * + * @param[in] rgba_input_ptr Pointer to the source image. 
Supported Format: U8 + * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 + * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) + * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) + * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel + * @param[out] v_output_ptr Pointer to the destination V channel. 
Supported Format: U8 + * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) + * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) + * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel + * + */ +__kernel void RGBA8888_to_IYUV_bt709( + IMAGE_DECLARATION(rgba_input), + IMAGE_DECLARATION(luma_output), + IMAGE_DECLARATION(u_output), + IMAGE_DECLARATION(v_output)) +{ + // handle 4 pixels every time, two lines, each line for 2 pixels + Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input); + Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); + Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); + Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); + + // Read 2 pixel of the first line + uchar8 rgb_0 = vload8(0, in_rgb.ptr); + uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4); + uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5); + uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6); + + float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); + float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); + float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); + + short2 i_y = convert_short2_rtz(f_y); + short2 i_u = convert_short2_rtz(f_u) + (short2)(128); + short2 i_v = convert_short2_rtz(f_v) + (short2)(128); + + uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); + vstore2(luma_0, 0, out_y.ptr); + + uchar2 cb_0 = 
convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); + uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); + + // Read 2 pixel of the second line + uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y); + uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4); + uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5); + uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6); + + f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); + f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); + f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); + + i_y = convert_short2_rtz(f_y); + i_u = convert_short2_rtz(f_u) + (short2)(128); + i_v = convert_short2_rtz(f_v) + (short2)(128); + + uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); + vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); + + uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); + uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); + uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), + ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); + *out_u.ptr = cbcr.x; + *out_v.ptr = cbcr.y; +} + +/** Convert an NV12 image to RGBA8888 + * + * Global Workgroup Size [ DIV_CEIL(width, 4), height ] + * No offset. + * + * @param[in] luma_input_ptr Pointer to the source luma channel. 
Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] rgb_output_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void NV12_to_RGBA8888_bt709( + IMAGE_DECLARATION(luma_input), + IMAGE_DECLARATION(uv_input), + IMAGE_DECLARATION(rgb_output)) +{ + Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); + Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); + Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); + + uchar4 luma_0 = vload4(0, in_luma.ptr); + uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); + uchar4 cbcr = vload4(0, in_uv.ptr); + char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); + char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); + + float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); + float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); + float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); + + float4 f_r = convert_float4(luma_0) + temp0; + float4 f_g = convert_float4(luma_0) + temp1; + float4 f_b = convert_float4(luma_0) + temp2; + + uchar4 r_0 = convert_uchar4_rtz(f_r); + uchar4 g_0 = convert_uchar4_rtz(f_g); + uchar4 b_0 = convert_uchar4_rtz(f_b); + + uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); + uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); + vstore8(rgb_0, 0, out_rgb.ptr); + vstore8(rgb_1, 0, 
out_rgb.ptr + 8); + + f_r = convert_float4(luma_1) + temp0; + f_g = convert_float4(luma_1) + temp1; + f_b = convert_float4(luma_1) + temp2; + + r_0 = convert_uchar4_rtz(f_r); + g_0 = convert_uchar4_rtz(f_g); + b_0 = convert_uchar4_rtz(f_b); + + rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); + rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); + vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); + vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); +} + +/** Convert an NV12 image to IYUV + * + * Global Workgroup Size [ DIV_CEIL(width, 16), height ] + * No offset. + * + * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. 
Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 + * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) + * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) + * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel + * @param[out] v_output_ptr Pointer to the destination V channel. 
Supported Format: U8 + * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) + * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) + * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel + */ +__kernel void NV12_to_IYUV_bt709( + IMAGE_DECLARATION(luma_input), + IMAGE_DECLARATION(uv_input), + IMAGE_DECLARATION(luma_output), + IMAGE_DECLARATION(u_output), + IMAGE_DECLARATION(v_output)) +{ + Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); + Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); + Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); + Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); + Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); + + // handle 32 pixels every time, two lines, each line for 16 pixels + uchar16 luma_0 = vload16(0, in_y.ptr); + uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); + uchar16 cbcr = vload16(0, in_uv.ptr); + uchar8 cb = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se); + uchar8 cr = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf); + + vstore16(luma_0, 0, out_y.ptr); + vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); + vstore8(cb, 0, out_u.ptr); + vstore8(cr, 0, out_v.ptr); +} + +/** Convert an NV12 image to YUV444 + * + * Global Workgroup Size [ DIV_CEIL(width, 16), height ] + * No offset. + * + * @param[in] luma_input_ptr Pointer to the source luma channel. 
Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] u_output_ptr Pointer to the destination U channel. 
Supported Format: U8 + * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) + * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) + * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel + * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 + * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) + * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) + * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel + */ +__kernel void NV12_to_YUV444_bt709( + IMAGE_DECLARATION(luma_input), + IMAGE_DECLARATION(uv_input), + IMAGE_DECLARATION(luma_output), + IMAGE_DECLARATION(u_output), + IMAGE_DECLARATION(v_output)) +{ + Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); + Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); + Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); + Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); + Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); + + // handle 32 pixels every time, two lines, each line for 16 pixels + uchar16 luma_0 = vload16(0, in_y.ptr); + uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); + uchar16 cbcr = vload16(0, in_uv.ptr); + uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8, + cbcr.sa, cbcr.sa, cbcr.sc, 
cbcr.sc, cbcr.se, cbcr.se); + uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9, + cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); + + vstore16(luma_0, 0, out_y.ptr); + vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); + vstore16(cb, 0, out_u.ptr); + vstore16(cb, 0, out_u.ptr + u_output_stride_y); + vstore16(cr, 0, out_v.ptr); + vstore16(cr, 0, out_v.ptr + v_output_stride_y); +} + +/** Convert an NV21 image to RGB888 + * + * Global Workgroup Size [ DIV_CEIL(width, 4), height ] + * No offset. + * + * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] rgb_output_ptr Pointer to the destination image. 
Supported Format: U8 + * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void NV21_to_RGB888_bt709( + IMAGE_DECLARATION(luma_input), + IMAGE_DECLARATION(uv_input), + IMAGE_DECLARATION(rgb_output)) +{ + Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); + Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); + Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); + + // handle 8 pixels every time, two lines, each line for 4 pixels + uchar4 luma_0 = vload4(0, in_y.ptr); + uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y); + uchar4 cbcr = vload4(0, in_uv.ptr); + char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); + char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); + + float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); + float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); + float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); + + float4 f_r = convert_float4(luma_0) + temp0; + float4 f_g = convert_float4(luma_0) + temp1; + float4 f_b = convert_float4(luma_0) + temp2; + + uchar4 r_0 = convert_uchar4_rtz(f_r); + uchar4 g_0 = convert_uchar4_rtz(f_g); + uchar4 b_0 = convert_uchar4_rtz(f_b); + + uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); + uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); + vstore8(rgb_0, 0, 
out_rgb.ptr); + vstore4(rgb_1, 0, out_rgb.ptr + 8); + + f_r = convert_float4(luma_1) + temp0; + f_g = convert_float4(luma_1) + temp1; + f_b = convert_float4(luma_1) + temp2; + + r_0 = convert_uchar4_rtz(f_r); + g_0 = convert_uchar4_rtz(f_g); + b_0 = convert_uchar4_rtz(f_b); + + rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); + rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); + vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); + vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); +} + +/** Convert an NV21 image to RGBA8888 + * + * Global Workgroup Size [ DIV_CEIL(width, 4), height ] + * No offset. + * + * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 + * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) + * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) + * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 + * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) + * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] rgba_output_ptr Pointer to the destination image. 
Supported Format: U8
 * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
 *
 * @note Each work-item converts a 4x2 block of pixels. R, G and B are saturated to [0, 255]; alpha is written as 255.
 */
__kernel void NV21_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // Two luma rows of 4 pixels share a single row of interleaved chroma
    const uchar4 luma_0 = vload4(0, in_luma.ptr);
    const uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    const uchar4 vu     = vload4(0, in_uv.ptr);

    // Bytes 0 and 2 are Cr, bytes 1 and 3 are Cb; each sample is used for two adjacent pixels.
    // The 128 bias is removed in float so the result does not depend on signed 8-bit wrap-around.
    const float4 cr = convert_float4(vu.s0022) - (float4)(128.0f);
    const float4 cb = convert_float4(vu.s1133) - (float4)(128.0f);

    // Chroma contribution to each colour channel, identical for both rows
    const float4 red_off   = (float4)(1.5748f) * cr;
    const float4 green_off = (float4)(0.0f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 blue_off  = (float4)(1.8556f) * cb;

    // First row. The _sat conversion clamps to [0, 255]; without it an out-of-range
    // float -> uchar conversion is implementation-defined.
    uchar4 r = convert_uchar4_sat_rtz(convert_float4(luma_0) + red_off);
    uchar4 g = convert_uchar4_sat_rtz(convert_float4(luma_0) + green_off);
    uchar4 b = convert_uchar4_sat_rtz(convert_float4(luma_0) + blue_off);

    // 4 pixels * 4 bytes = 16 bytes per row, written as two groups of two RGBA pixels
    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + 8);

    // Second row
    r = convert_uchar4_sat_rtz(convert_float4(luma_1) + red_off);
    g = convert_uchar4_sat_rtz(convert_float4(luma_1) + green_off);
    b = convert_uchar4_sat_rtz(convert_float4(luma_1) + blue_off);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr + rgba_output_stride_y);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + rgba_output_stride_y + 8);
}

/** Convert an NV21 image to YUV444
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel.
Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV21_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // A work-item covers 32 pixels: two rows of 16 that share one row of interleaved chroma
    const uchar16 row_0 = vload16(0, src_luma.ptr);
    const uchar16 row_1 = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar16 vu    = vload16(0, src_vu.ptr);

    // Even bytes go to the V plane, odd bytes to the U plane; every sample is doubled horizontally
    const uchar16 v_wide = vu.s0022446688aaccee;
    const uchar16 u_wide = vu.s1133557799bbddff;

    // Luma is copied through unchanged
    vstore16(row_0, 0, dst_luma.ptr);
    vstore16(row_1, 0, dst_luma.ptr + luma_output_stride_y);

    // The same chroma row is written to both output rows
    vstore16(u_wide, 0, dst_u.ptr);
    vstore16(u_wide, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_wide, 0, dst_v.ptr);
    vstore16(v_wide, 0, dst_v.ptr + v_output_stride_y);
}

/** Convert an NV21 image to IYUV
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel.
Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV21_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Two rows of 16 luma samples plus the 16 interleaved chroma bytes they share
    const uchar16 row_0 = vload16(0, src_luma.ptr);
    const uchar16 row_1 = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar16 vu    = vload16(0, src_vu.ptr);

    // De-interleave: even bytes go to the V plane, odd bytes to the U plane
    const uchar8 v_samples = vu.even;
    const uchar8 u_samples = vu.odd;

    // Luma is copied through unchanged; chroma keeps its resolution, so one row of 8 per plane
    vstore16(row_0, 0, dst_luma.ptr);
    vstore16(row_1, 0, dst_luma.ptr + luma_output_stride_y);
    vstore8(u_samples, 0, dst_u.ptr);
    vstore8(v_samples, 0, dst_v.ptr);
}

/** Convert a UYVY image to IYUV using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  uyvy_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  uyvy_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  uyvy_input_step_x                         uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uyvy_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  uyvy_input_step_y                         uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uyvy_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel.
Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void UYVY422_to_IYUV_bt709(
    IMAGE_DECLARATION(uyvy_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // A work-item covers 16 pixels: two rows of 8 pixels, 16 packed bytes per row

    // First row: luma sits in the odd bytes, Cb in bytes 0/4/8/12 and Cr in bytes 2/6/10/14
    uchar16       packed = vload16(0, src.ptr);
    const ushort4 cb_top = convert_ushort4(packed.s048c);
    const ushort4 cr_top = convert_ushort4(packed.s26ae);
    vstore8(packed.odd, 0, dst_y.ptr);

    // Second row, same byte layout
    packed               = vload16(0, src.ptr + uyvy_input_stride_y);
    const ushort4 cb_bot = convert_ushort4(packed.s048c);
    const ushort4 cr_bot = convert_ushort4(packed.s26ae);
    vstore8(packed.odd, 0, dst_y.ptr + luma_output_stride_y);

    // Chroma is averaged vertically; the sum is formed in 16 bits and halved, rounding down
    vstore4(convert_uchar4((cb_top + cb_bot) >> (ushort4)(1)), 0, dst_u.ptr);
    vstore4(convert_uchar4((cr_top + cr_bot) >> (ushort4)(1)), 0, dst_v.ptr);
}

/** Convert a YUYV image to IYUV using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel.
Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void YUYV422_to_IYUV_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // A work-item covers 16 pixels: two rows of 8 pixels, 16 packed bytes per row

    // First row: luma sits in the even bytes, Cb in bytes 1/5/9/13 and Cr in bytes 3/7/11/15
    uchar16       packed = vload16(0, src.ptr);
    const ushort4 cb_top = convert_ushort4(packed.s159d);
    const ushort4 cr_top = convert_ushort4(packed.s37bf);
    vstore8(packed.even, 0, dst_y.ptr);

    // Second row, same byte layout
    packed               = vload16(0, src.ptr + yuyv_input_stride_y);
    const ushort4 cb_bot = convert_ushort4(packed.s159d);
    const ushort4 cr_bot = convert_ushort4(packed.s37bf);
    vstore8(packed.even, 0, dst_y.ptr + luma_output_stride_y);

    // Chroma is averaged vertically; the sum is formed in 16 bits and halved, rounding down
    vstore4(convert_uchar4((cb_top + cb_bot) >> (ushort4)(1)), 0, dst_u.ptr);
    vstore4(convert_uchar4((cr_top + cr_bot) >> (ushort4)(1)), 0, dst_v.ptr);
}

/** Convert an IYUV image to RGB888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel.
Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
 * @param[out] rgb_output_ptr                            Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                       Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                         rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                       Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                         rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes  The offset of the first element in the destination image
 *
 * @note Each work-item converts a 4x2 block of pixels. The R, G and B results are saturated to [0, 255].
 */
__kernel void IYUV_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // Two luma rows of 4 pixels share two samples from each chroma plane
    const uchar4 luma_0 = vload4(0, in_y.ptr);
    const uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    const uchar2 u      = vload2(0, in_u.ptr);
    const uchar2 v      = vload2(0, in_v.ptr);

    // Each chroma sample is used for two adjacent pixels.
    // The 128 bias is removed in float so the result does not depend on signed 8-bit wrap-around.
    const float4 cb = convert_float4(u.s0011) - (float4)(128.0f);
    const float4 cr = convert_float4(v.s0011) - (float4)(128.0f);

    // Chroma contribution to each colour channel, identical for both rows
    const float4 red_off   = (float4)(1.5748f) * cr;
    const float4 green_off = (float4)(0.0f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 blue_off  = (float4)(1.8556f) * cb;

    // First row. The _sat conversion clamps to [0, 255]; without it an out-of-range
    // float -> uchar conversion is implementation-defined.
    uchar4 r = convert_uchar4_sat_rtz(convert_float4(luma_0) + red_off);
    uchar4 g = convert_uchar4_sat_rtz(convert_float4(luma_0) + green_off);
    uchar4 b = convert_uchar4_sat_rtz(convert_float4(luma_0) + blue_off);

    // 4 pixels * 3 bytes = 12 bytes per row, written as 8 + 4
    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + 8);

    // Second row
    r = convert_uchar4_sat_rtz(convert_float4(luma_1) + red_off);
    g = convert_uchar4_sat_rtz(convert_float4(luma_1) + green_off);
    b = convert_uchar4_sat_rtz(convert_float4(luma_1) + blue_off);

    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + rgb_output_stride_y + 8);
}

/** Convert an IYUV image to RGBA8888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel.
Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
 * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
 *
 * @note Each work-item converts a 4x2 block of pixels. R, G and B are saturated to [0, 255]; alpha is written as 255.
 */
__kernel void IYUV_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // Two luma rows of 4 pixels share two samples from each chroma plane
    const uchar4 luma_0 = vload4(0, in_y.ptr);
    const uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    const uchar2 u      = vload2(0, in_u.ptr);
    const uchar2 v      = vload2(0, in_v.ptr);

    // Each chroma sample is used for two adjacent pixels.
    // The 128 bias is removed in float so the result does not depend on signed 8-bit wrap-around.
    const float4 cb = convert_float4(u.s0011) - (float4)(128.0f);
    const float4 cr = convert_float4(v.s0011) - (float4)(128.0f);

    // Chroma contribution to each colour channel, identical for both rows
    const float4 red_off   = (float4)(1.5748f) * cr;
    const float4 green_off = (float4)(0.0f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 blue_off  = (float4)(1.8556f) * cb;

    // First row. The _sat conversion clamps to [0, 255]; without it an out-of-range
    // float -> uchar conversion is implementation-defined.
    uchar4 r = convert_uchar4_sat_rtz(convert_float4(luma_0) + red_off);
    uchar4 g = convert_uchar4_sat_rtz(convert_float4(luma_0) + green_off);
    uchar4 b = convert_uchar4_sat_rtz(convert_float4(luma_0) + blue_off);

    // 4 pixels * 4 bytes = 16 bytes per row, written as two groups of two RGBA pixels
    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + 8);

    // Second row
    r = convert_uchar4_sat_rtz(convert_float4(luma_1) + red_off);
    g = convert_uchar4_sat_rtz(convert_float4(luma_1) + green_off);
    b = convert_uchar4_sat_rtz(convert_float4(luma_1) + blue_off);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr + rgba_output_stride_y);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + rgba_output_stride_y + 8);
}

/** Convert an IYUV image to YUV444
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel.
Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void IYUV_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // A work-item covers 32 pixels: two rows of 16 that share 8 samples from each chroma plane
    const uchar16 row_0  = vload16(0, src_y.ptr);
    const uchar16 row_1  = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar8  u_half = vload8(0, src_u.ptr);
    const uchar8  v_half = vload8(0, src_v.ptr);

    // Double every chroma sample horizontally to reach 16 samples per row
    const uchar16 u_wide = u_half.s0011223344556677;
    const uchar16 v_wide = v_half.s0011223344556677;

    // Luma is copied through unchanged
    vstore16(row_0, 0, dst_y.ptr);
    vstore16(row_1, 0, dst_y.ptr + luma_output_stride_y);

    // The same chroma row is written to both output rows
    vstore16(u_wide, 0, dst_u.ptr);
    vstore16(u_wide, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_wide, 0, dst_v.ptr);
    vstore16(v_wide, 0, dst_v.ptr + v_output_stride_y);
}

/** Convert an IYUV image to NV12
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel.
Supported Format: U8 + * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) + * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel + * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8 + * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes) + * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes) + * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] uv_output_ptr Pointer to the destination UV channel. 
Supported Format: U8
+ * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
+ * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
+ *
+ * @note No colour arithmetic is performed: luma is copied and the separate U and V planes
+ *       are interleaved into one UV plane.
+ *
+ */
+__kernel void IYUV_to_NV12_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    // One Image handle per plane (CONVERT_TO_IMAGE_STRUCT is defined outside this listing).
+    Image in_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Each work-item handles 32 luma pixels (two rows of 16) plus the 8 U and 8 V samples
+    // that belong to them.
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar8  cb     = vload8(0, in_u.ptr);
+    uchar8  cr     = vload8(0, in_v.ptr);
+    // Interleave as U0, V0, U1, V1, ... so that one 16-byte store fills the UV row.
+    uchar16 cbcr   = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
+                               cr.s6, cb.s7, cr.s7);
+
+    // Two luma rows are written but only a single interleaved chroma row.
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore16(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a YUYV image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  yuyv_input_ptr                            Pointer to the source image. 
Supported Format: U8 + * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel + * @param[out] uv_output_ptr Pointer to the destination UV channel. 
Supported Format: U8
+ * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
+ *
+ * @note No colour arithmetic is performed: luma bytes are de-interleaved and the chroma
+ *       bytes of two consecutive rows are averaged.
+ *
+ */
+__kernel void YUYV422_to_NV12_bt709(
+    IMAGE_DECLARATION(yuyv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv  = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Each work-item handles 16 pixels: 8 from the current row and 8 from the next one.
+    // In every 16-byte load the even bytes (s0, s2, ...) are stored as luma and the odd
+    // bytes (s1, s3, ...) are treated as chroma.
+    uchar16 yuyv   = vload16(0, in_yuyv.ptr);
+    // Chroma is widened to ushort so that the sum of two rows cannot wrap at 8 bits.
+    ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+    uchar8  luma   = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    vstore8(luma, 0, out_y.ptr);
+
+    // Second row: same de-interleave, one yuyv_input_stride_y further on.
+    yuyv           = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+    ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+    luma           = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+    // Vertical chroma average of the two rows (integer division, so the result truncates).
+    uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
+    vstore8(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a UYVY image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @note NOTE(review): the kernel body stores 8 luma pixels per row per work-item, the same
+ *       as the YUYV variant which documents DIV_CEIL(width, 8); confirm the X divisor
+ *       against the host-side window.
+ *
+ * @param[in]  input_uyvy_ptr                            Pointer to the source image. 
Supported Format: U8 + * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8 + * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes) + * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes) + * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel + * @param[out] uv_ptr Pointer to the destination uv channel. 
Supported Format: U8
+ * @param[in]  uv_stride_x                               Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in]  uv_step_x                                 uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_stride_y                               Stride of the destination uv channel in Y dimension (in bytes)
+ * @param[in]  uv_step_y                                 uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_offset_first_element_in_bytes          The offset of the first element in the destination image uv channel
+ *
+ * @note No colour arithmetic is performed: luma bytes are de-interleaved and the chroma
+ *       bytes of two consecutive rows are averaged.
+ *
+ */
+__kernel void UYVY422_to_NV12_bt709(
+    IMAGE_DECLARATION(input_uyvy),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image in     = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // Each work-item handles 16 pixels: 8 from the current (top) row and 8 from the row below.
+    // Here the odd bytes (s1, s3, ...) are luma and the even bytes (s0, s2, ...) are chroma.
+    const uchar16 uyvy_t = vload16(0, in.ptr);
+    vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
+
+    const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
+    vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
+
+    // Chroma is widened to ushort so the two-row sum cannot wrap, then averaged
+    // (integer division, so the result truncates).
+    const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
+    const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
+    const uchar8  cbcr   = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
+    vstore8(cbcr, 0, out_uv.ptr);
+}
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000000..00f5189508
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This kernel concatenates the input tensor into the output tensor along the third dimension + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  offset                            The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    unsigned int offset)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Each work-item copies 4 consecutive floats from the source position.
+    float4 source_values = vload4(0, (__global float *)src.ptr);
+
+    // offset is added to dst.ptr before the cast to float*, i.e. in units of the Image
+    // pointer's element type (documented above as bytes).
+    vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
new file mode 100644
index 0000000000..3733d0c733
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// Element type used for the intermediate sums; short unless the build passes -DDATA_TYPE.
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+// Element type written to the destination; uchar unless the build passes -DDATA_TYPE_OUT.
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a vector of 8 DATA_TYPE values (short with the default DATA_TYPE) containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+                                                  const short left_coeff,
+                                                  const short middle_coeff,
+                                                  const short right_coeff)
+{
+    // 16 bytes are loaded although only bytes 0..9 are used: three 8-wide windows shifted
+    // by one byte each.
+    uchar16 temp = vload16(0, left_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    // Weighted sum of the three windows, computed in DATA_TYPE precision.
+    return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result. 
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src   A pointer to source Image structure
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @note scale is not checked here; the result is divided by it unconditionally, so callers
+ *       must pass a non-zero value.
+ *
+ * @return a vector of 8 DATA_TYPE values (short with the default DATA_TYPE) containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+    Image *src,
+    const short mat0, const short mat1, const short mat2,
+    const short mat3, const short mat4, const short mat5,
+    const short mat6, const short mat7, const short mat8, uint scale)
+{
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    // Each row starts one pixel to the left (x = -1) of the current position.
+    // Row 0
+    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+    // Row 1
+    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+    // Row 2
+    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+    // Divide by the scale
+    return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. 
-DMAT0=1 -DMAT1=2, ... -DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // The coefficients and the scale are compile-time macros, not kernel arguments.
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution3x3(&src,
+                            MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+    // Convert the 8 results to DATA_TYPE_OUT through CONVERT_SAT (helpers.h; presumably a
+    // saturating conversion) and store them in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
new file mode 100644
index 0000000000..d1335c5558
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// Element type of the horizontal pass and of the intermediate image; short unless the
+// build passes -DDATA_TYPE.
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+// Accumulator element type of the vertical pass; int unless the build passes -DCOMPUTE_TYPE.
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+// Element type written to the final destination; uchar unless the build passes -DDATA_TYPE_OUT.
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a vector of 8 DATA_TYPE values (short with the default DATA_TYPE) containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff)
+{
+    // 16 bytes are loaded although only bytes 0..11 (s0..sb) are used: five 8-wide windows
+    // shifted by one byte each.
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    // Weighted sum of the five windows, computed in DATA_TYPE precision.
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 elements of type DATA_TYPE read from 5 consecutive rows of a single channel image.
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the most down pixel
+ *
+ * @return a vector of 8 COMPUTE_TYPE values (int with the default COMPUTE_TYPE) containing 8 convoluted values. 
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff)
+{
+    // val holds the current row's 8 elements; out accumulates the weighted rows in
+    // COMPUTE_TYPE precision.
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    // Rows are read as DATA_TYPE elements at the same x and at y - 2 ... y + 2
+    // (offset() comes from helpers.h).
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    // The sum is returned unscaled.
+    return out;
+}
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3 , mat4 ]\n
+ * [ mat5, mat6, mat7, mat8, mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src   A pointer to source Image structure. 
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @note The division is skipped when scale is 0, and the return type is always short8
+ *       (saturated), whatever DATA_TYPE is used for the accumulation.
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+    Image *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    uint scale)
+{
+    // Accumulator for the five row convolutions, in DATA_TYPE precision.
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    // One horizontal 1x5 pass per matrix row; each starts two pixels to the left (x = -2)
+    // and covers rows y - 2 ... y + 2.
+    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+    // Guard against dividing by zero: a zero scale leaves the sum unscaled.
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    // Saturate to the short range before returning.
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. 
Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Horizontal pass only: the window starts two pixels to the left (x = -2) on the
+    // current row. MAT0..MAT4 are compile-time macros.
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+    // Store the 8 unscaled sums in dst as DATA_TYPE elements
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. 
Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Vertical pass over the intermediate image; MAT5..MAT9 are compile-time macros.
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+    // Divide by the scale (unconditionally, so SCALE must be non-zero)
+    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Convert to DATA_TYPE_OUT through CONVERT_SAT (helpers.h; presumably a saturating
+    // conversion) and store the 8 results in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. 
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution5x5_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + short8 pixels = convolution5x5(&src, + MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, + MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE); + + // Saturate the result to DATA_TYPE_OUT and store it in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); +} + +#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl new file mode 100644 index 0000000000..74a0055370 --- /dev/null +++ b/src/core/CL/cl_kernels/convolution7x7.cl @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifndef DATA_TYPE +#define DATA_TYPE short +#endif + +#ifndef COMPUTE_TYPE +#define COMPUTE_TYPE int +#endif + +#ifndef DATA_TYPE_OUT +#define DATA_TYPE_OUT uchar +#endif + +/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). + * + * @param[in] left_pixel Pointer to the left pixel + * @param[in] left1_coeff Weight of the most left pixel + * @param[in] left2_coeff Weight of the second left pixel + * @param[in] left3_coeff Weight of the left pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] right1_coeff Weight of the right pixel + * @param[in] right2_coeff Weight of the second right pixel + * @param[in] right3_coeff Weight of the most right pixel + * + * @return a short8 containing 8 convoluted values. 
+ */ +VEC_DATA_TYPE(DATA_TYPE, 8) +convolution1x7( + __global const uchar *left_pixel, + const short left1_coeff, + const short left2_coeff, + const short left3_coeff, + const short middle_coeff, + const short right1_coeff, + const short right2_coeff, + const short right3_coeff) +{ + /* Load 16 consecutive bytes; the seven 8-wide windows below use elements 0..d */ + uchar16 temp = vload16(0, left_pixel); + + VEC_DATA_TYPE(DATA_TYPE, 8) + left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); + + /* Weighted sum of the seven shifted windows, computed in DATA_TYPE */ + return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, + 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff; +} + +/** Compute a 1D vertical convolution of size 7 for 8 pixels of a single channel image whose elements are of type DATA_TYPE. + * + * @param[in] src Pointer to source image. + * @param[in] up1_coeff Weight of the most up pixel + * @param[in] up2_coeff Weight of the second up pixel + * @param[in] up3_coeff Weight of the up pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] down1_coeff Weight of the down pixel + * @param[in] down2_coeff Weight of the second down pixel + * @param[in] down3_coeff Weight of the most down pixel + * + * @return a vector of 8 COMPUTE_TYPE elements containing the 8 convoluted values. 
+ */ +VEC_DATA_TYPE(COMPUTE_TYPE, 8) +convolution7x1( + Image *src, + const short up1_coeff, + const short up2_coeff, + const short up3_coeff, + const short middle_coeff, + const short down1_coeff, + const short down2_coeff, + const short down3_coeff) +{ + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + val; + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; + + /* Rows y-3 .. y+3: load 8 DATA_TYPE elements, widen them to COMPUTE_TYPE, then multiply-accumulate */ + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff; + + return out; +} + +/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result. 
+ * + * Convolution matrix layout:\n + * [ mat0, mat1, mat2, mat3, mat4, mat5, mat6 ]\n + * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n + * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n + * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n + * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n + * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n + * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ] + * + * @param[in] src A pointer to source Image structure. + * @param[in] mat0 Coefficient from the convolution matrix + * @param[in] mat1 Coefficient from the convolution matrix + * @param[in] mat2 Coefficient from the convolution matrix + * @param[in] mat3 Coefficient from the convolution matrix + * @param[in] mat4 Coefficient from the convolution matrix + * @param[in] mat5 Coefficient from the convolution matrix + * @param[in] mat6 Coefficient from the convolution matrix + * @param[in] mat7 Coefficient from the convolution matrix + * @param[in] mat8 Coefficient from the convolution matrix + * @param[in] mat9 Coefficient from the convolution matrix + * @param[in] mat10 Coefficient from the convolution matrix + * @param[in] mat11 Coefficient from the convolution matrix + * @param[in] mat12 Coefficient from the convolution matrix + * @param[in] mat13 Coefficient from the convolution matrix + * @param[in] mat14 Coefficient from the convolution matrix + * @param[in] mat15 Coefficient from the convolution matrix + * @param[in] mat16 Coefficient from the convolution matrix + * @param[in] mat17 Coefficient from the convolution matrix + * @param[in] mat18 Coefficient from the convolution matrix + * @param[in] mat19 Coefficient from the convolution matrix + * @param[in] mat20 Coefficient from the convolution matrix + * @param[in] mat21 Coefficient from the convolution matrix + * @param[in] mat22 Coefficient from the convolution 
matrix + * @param[in] mat23 Coefficient from the convolution matrix + * @param[in] mat24 Coefficient from the convolution matrix + * @param[in] mat25 Coefficient from the convolution matrix + * @param[in] mat26 Coefficient from the convolution matrix + * @param[in] mat27 Coefficient from the convolution matrix + * @param[in] mat28 Coefficient from the convolution matrix + * @param[in] mat29 Coefficient from the convolution matrix + * @param[in] mat30 Coefficient from the convolution matrix + * @param[in] mat31 Coefficient from the convolution matrix + * @param[in] mat32 Coefficient from the convolution matrix + * @param[in] mat33 Coefficient from the convolution matrix + * @param[in] mat34 Coefficient from the convolution matrix + * @param[in] mat35 Coefficient from the convolution matrix + * @param[in] mat36 Coefficient from the convolution matrix + * @param[in] mat37 Coefficient from the convolution matrix + * @param[in] mat38 Coefficient from the convolution matrix + * @param[in] mat39 Coefficient from the convolution matrix + * @param[in] mat40 Coefficient from the convolution matrix + * @param[in] mat41 Coefficient from the convolution matrix + * @param[in] mat42 Coefficient from the convolution matrix + * @param[in] mat43 Coefficient from the convolution matrix + * @param[in] mat44 Coefficient from the convolution matrix + * @param[in] mat45 Coefficient from the convolution matrix + * @param[in] mat46 Coefficient from the convolution matrix + * @param[in] mat47 Coefficient from the convolution matrix + * @param[in] mat48 Coefficient from the convolution matrix + * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) + * + */ +short8 convolution7x7( + Image *src, + const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, + const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, + const short mat10, const short mat11, const short mat12, const short mat13, 
const short mat14, + const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, + const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, + const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, + const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, + const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, + const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, + const short mat45, const short mat46, const short mat47, const short mat48, uint scale) +{ + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels; + + /* One horizontal 1x7 pass per matrix row (rows y-3 .. y+3), accumulated in DATA_TYPE */ + pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6); + pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13); + pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20); + pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27); + pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34); + pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41); + pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48); + + /* The division is skipped when scale is 0 */ + if(scale > 0) + { + pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; + } + + /* Saturate the accumulated values to the short range */ + return convert_short8_sat(pixels); +} + +#ifndef DYNAMIC_MATRIX_CONVOLUTION + +/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image. + * + * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n + * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution_separable1x7_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Output pixels: the window starts 3 pixels to the left of the current position + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6); + + // Store the 8 results in dst as DATA_TYPE values + vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); +} + +/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image. + * + * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n + * e.g. -DMAT7=7 -DMAT8=8, ... -DMAT13=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U16, S16, S32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution_separable7x1_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Output pixels: vertical pass over rows y-3 .. y+3, accumulated in COMPUTE_TYPE + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13); + + // Divide by the scale (SCALE is a compile-time constant and is not checked against zero here) + pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; + + // Saturate to DATA_TYPE_OUT and store the 8 results in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); +} + +/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders. + * + * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n + * e.g. -DMAT0=7 -DMAT1=8, ... 
-DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution7x7_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + short8 pixels = convolution7x7(&src, + MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, + MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, + MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, + MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE); + + // Saturate results to the range of DATA_TYPE_OUT ([ 0, 255 ] for the default uchar) and store them in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); +} + +#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution9x9.cl 
b/src/core/CL/cl_kernels/convolution9x9.cl new file mode 100644 index 0000000000..d8b07cafac --- /dev/null +++ b/src/core/CL/cl_kernels/convolution9x9.cl @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifndef DATA_TYPE +#define DATA_TYPE short +#endif + +#ifndef COMPUTE_TYPE +#define COMPUTE_TYPE int +#endif + +#ifndef DATA_TYPE_OUT +#define DATA_TYPE_OUT uchar +#endif + +/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). 
+ * + * @param[in] left_pixel Pointer to the left pixel + * @param[in] left1_coeff Weight of the most left pixel + * @param[in] left2_coeff Weight of the second left pixel + * @param[in] left3_coeff Weight of the third left pixel + * @param[in] left4_coeff Weight of the left pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] right1_coeff Weight of the right pixel + * @param[in] right2_coeff Weight of the second right pixel + * @param[in] right3_coeff Weight of the third right pixel + * @param[in] right4_coeff Weight of the most right pixel + * + * @return a short8 containing 8 convoluted values. + */ +VEC_DATA_TYPE(DATA_TYPE, 8) +convolution1x9( + __global const uchar *left_pixel, + const short left1_coeff, + const short left2_coeff, + const short left3_coeff, + const short left4_coeff, + const short middle_coeff, + const short right1_coeff, + const short right2_coeff, + const short right3_coeff, + const short right4_coeff) +{ + /* Load 16 consecutive bytes; the nine 8-wide windows below use all 16 elements (0..f) */ + uchar16 temp = vload16(0, left_pixel); + + VEC_DATA_TYPE(DATA_TYPE, 8) + left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8)); + VEC_DATA_TYPE(DATA_TYPE, 8) + right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8)); + + /* Weighted sum of the nine shifted windows, computed in DATA_TYPE */ + return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * 
(VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE, + 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, + 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff; +} + +/** Compute a 1D vertical convolution of size 9 for 8 pixels of a single channel image whose elements are of type DATA_TYPE. + * + * @param[in] src Pointer to source image. + * @param[in] up1_coeff Weight of the most up pixel + * @param[in] up2_coeff Weight of the second up pixel + * @param[in] up3_coeff Weight of the third up pixel + * @param[in] up4_coeff Weight of the up pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] down1_coeff Weight of the down pixel + * @param[in] down2_coeff Weight of the second down pixel + * @param[in] down3_coeff Weight of the third down pixel + * @param[in] down4_coeff Weight of the most down pixel + * + * @return a vector of 8 COMPUTE_TYPE elements containing the 8 convoluted values. 
+ */ +VEC_DATA_TYPE(COMPUTE_TYPE, 8) +convolution9x1( + Image *src, + const short up1_coeff, + const short up2_coeff, + const short up3_coeff, + const short up4_coeff, + const short middle_coeff, + const short down1_coeff, + const short down2_coeff, + const short down3_coeff, + const short down4_coeff) +{ + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + val; + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; + + /* Rows y-4 .. y+4: load 8 DATA_TYPE elements, widen them to COMPUTE_TYPE, then multiply-accumulate */ + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff; + + val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); + out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff; + + return out; +} + +/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the 
result. + * + * Convolution matrix layout:\n + * [ mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8 ]\n + * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n + * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n + * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n + * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n + * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n + * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n + * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n + * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ] + * + * @param[in] src A pointer to source Image structure. + * @param[in] mat0 Coefficient from the convolution matrix + * @param[in] mat1 Coefficient from the convolution matrix + * @param[in] mat2 Coefficient from the convolution matrix + * @param[in] mat3 Coefficient from the convolution matrix + * @param[in] mat4 Coefficient from the convolution matrix + * @param[in] mat5 Coefficient from the convolution matrix + * @param[in] mat6 Coefficient from the convolution matrix + * @param[in] mat7 Coefficient from the convolution matrix + * @param[in] mat8 Coefficient from the convolution matrix + * @param[in] mat9 Coefficient from the convolution matrix + * @param[in] mat10 Coefficient from the convolution matrix + * @param[in] mat11 Coefficient from the convolution matrix + * @param[in] mat12 Coefficient from the convolution matrix + * @param[in] mat13 Coefficient from the convolution matrix + * @param[in] mat14 Coefficient from the convolution matrix + * @param[in] mat15 Coefficient from the convolution matrix + * @param[in] mat16 Coefficient from the convolution matrix + * @param[in] mat17 Coefficient from the convolution matrix + * @param[in] mat18 Coefficient from the 
convolution matrix + * @param[in] mat19 Coefficient from the convolution matrix + * @param[in] mat20 Coefficient from the convolution matrix + * @param[in] mat21 Coefficient from the convolution matrix + * @param[in] mat22 Coefficient from the convolution matrix + * @param[in] mat23 Coefficient from the convolution matrix + * @param[in] mat24 Coefficient from the convolution matrix + * @param[in] mat25 Coefficient from the convolution matrix + * @param[in] mat26 Coefficient from the convolution matrix + * @param[in] mat27 Coefficient from the convolution matrix + * @param[in] mat28 Coefficient from the convolution matrix + * @param[in] mat29 Coefficient from the convolution matrix + * @param[in] mat30 Coefficient from the convolution matrix + * @param[in] mat31 Coefficient from the convolution matrix + * @param[in] mat32 Coefficient from the convolution matrix + * @param[in] mat33 Coefficient from the convolution matrix + * @param[in] mat34 Coefficient from the convolution matrix + * @param[in] mat35 Coefficient from the convolution matrix + * @param[in] mat36 Coefficient from the convolution matrix + * @param[in] mat37 Coefficient from the convolution matrix + * @param[in] mat38 Coefficient from the convolution matrix + * @param[in] mat39 Coefficient from the convolution matrix + * @param[in] mat40 Coefficient from the convolution matrix + * @param[in] mat41 Coefficient from the convolution matrix + * @param[in] mat42 Coefficient from the convolution matrix + * @param[in] mat43 Coefficient from the convolution matrix + * @param[in] mat44 Coefficient from the convolution matrix + * @param[in] mat45 Coefficient from the convolution matrix + * @param[in] mat46 Coefficient from the convolution matrix + * @param[in] mat47 Coefficient from the convolution matrix + * @param[in] mat48 Coefficient from the convolution matrix + * @param[in] mat49 Coefficient from the convolution matrix + * @param[in] mat50 Coefficient from the convolution matrix + * @param[in] mat51 
Coefficient from the convolution matrix + * @param[in] mat52 Coefficient from the convolution matrix + * @param[in] mat53 Coefficient from the convolution matrix + * @param[in] mat54 Coefficient from the convolution matrix + * @param[in] mat55 Coefficient from the convolution matrix + * @param[in] mat56 Coefficient from the convolution matrix + * @param[in] mat57 Coefficient from the convolution matrix + * @param[in] mat58 Coefficient from the convolution matrix + * @param[in] mat59 Coefficient from the convolution matrix + * @param[in] mat60 Coefficient from the convolution matrix + * @param[in] mat61 Coefficient from the convolution matrix + * @param[in] mat62 Coefficient from the convolution matrix + * @param[in] mat63 Coefficient from the convolution matrix + * @param[in] mat64 Coefficient from the convolution matrix + * @param[in] mat65 Coefficient from the convolution matrix + * @param[in] mat66 Coefficient from the convolution matrix + * @param[in] mat67 Coefficient from the convolution matrix + * @param[in] mat68 Coefficient from the convolution matrix + * @param[in] mat69 Coefficient from the convolution matrix + * @param[in] mat70 Coefficient from the convolution matrix + * @param[in] mat71 Coefficient from the convolution matrix + * @param[in] mat72 Coefficient from the convolution matrix + * @param[in] mat73 Coefficient from the convolution matrix + * @param[in] mat74 Coefficient from the convolution matrix + * @param[in] mat75 Coefficient from the convolution matrix + * @param[in] mat76 Coefficient from the convolution matrix + * @param[in] mat77 Coefficient from the convolution matrix + * @param[in] mat78 Coefficient from the convolution matrix + * @param[in] mat79 Coefficient from the convolution matrix + * @param[in] mat80 Coefficient from the convolution matrix + * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) + * + */ +short8 convolution9x9( + 
Image *src, + const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, + const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, + const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, + const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, + const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, + const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, + const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, + const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, + const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, + const short mat45, const short mat46, const short mat47, const short mat48, const short mat49, + const short mat50, const short mat51, const short mat52, const short mat53, const short mat54, + const short mat55, const short mat56, const short mat57, const short mat58, const short mat59, + const short mat60, const short mat61, const short mat62, const short mat63, const short mat64, + const short mat65, const short mat66, const short mat67, const short mat68, const short mat69, + const short mat70, const short mat71, const short mat72, const short mat73, const short mat74, + const short mat75, const short mat76, const short mat77, const short mat78, const short mat79, + const short mat80, uint scale) +{ + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels; + + /* One horizontal 1x9 pass per matrix row (rows y-4 .. y+4), accumulated in DATA_TYPE */ + pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8); + pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17); + pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26); + pixels += convolution1x9(offset(src, -4, 
-1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35); + pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44); + pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53); + pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62); + pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71); + pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80); + + /* The division is skipped when scale is 0 */ + if(scale > 0) + { + pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; + } + + /* Saturate the accumulated values to the short range */ + return convert_short8_sat(pixels); +} + +#ifndef DYNAMIC_MATRIX_CONVOLUTION + +/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image. + * + * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n + * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT8=8, -DDATA_TYPE=int + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U16, S16, S32 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution_separable1x9_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute 8 output values with convolution1x9 (MAT0..MAT8); the input is read through offset(&src, -4, 0), presumably 4 pixels to the left of the current position (confirm against helpers.h) + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8); + + // Store the 8 results in dst as they are; the kernel itself applies no scaling and no CONVERT_SAT + vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); +} + +/** Apply a 9x1 static convolution matrix to a single channel input image (U16, S16 or S32, see src_ptr) and output a single channel image. + * + * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n + * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution_separable9x1_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute 8 output values in COMPUTE_TYPE with convolution9x1 (MAT9..MAT17) + VEC_DATA_TYPE(COMPUTE_TYPE, 8) + pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17); + + // Divide by the compile-time SCALE (no zero check is performed here) + pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; + + // Convert to DATA_TYPE_OUT with CONVERT_SAT and store the 8 results in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); +} + +/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders + * + * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n + * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int + * + * @param[in] src_ptr Pointer to the source image.
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution9x9_static( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + short8 pixels = convolution9x9(&src, + MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, + MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, + MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, + MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49, + MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, + MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73, + MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE); /* 8 results as short8; convolution9x9 only divides by the scale when it is greater than 0 */ + + // Convert the result to DATA_TYPE_OUT with CONVERT_SAT and store the 8 values in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT
*)dst.ptr); +} + +#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl new file mode 100644 index 0000000000..bd5dfaff68 --- /dev/null +++ b/src/core/CL/cl_kernels/convolution_layer.cl @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This kernel reshapes the tensor's low three dimensions to a single column + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short. The bias_* arguments are only part of the kernel signature when -DHAS_BIAS is passed. + * + * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F16, F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Same as input + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] bias_ptr Pointer to the bias tensor. Same as input + * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) + * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the bias tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + * @param[in] depth The depth of the input tensor + * @param[in] total_filters Total number of filters.
4th dimension of the weights matrix + */ +__kernel void reshape_to_columns( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), +#if defined HAS_BIAS + VECTOR_DECLARATION(bias), +#endif + uint width, uint height, uint depth, uint total_filters) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)); /* true only for the work-item with the highest global id in all three dimensions */ + + __global uchar *tmp_src_ptr = src.ptr; + __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id( + 2) * width * height * dst_stride_y; /* destination row number is id(0), plus id(1) times width, plus id(2) times width times height */ +#if defined HAS_BIAS + __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes; +#endif + + if(is_last_thread) /* when HAS_BIAS is defined this work-item also copies one bias value per iteration */ + { + for(uint i = 0; i < total_filters; ++i) + { + *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); + +#if defined HAS_BIAS + *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr)); /* the bias goes one row (dst_stride_y) below this work-item's own row */ + tmp_bias_ptr += bias_stride_x; +#endif + tmp_src_ptr += depth * src_stride_z; /* skip depth Z-slices of the source per iteration */ + tmp_dst_ptr += dst_stride_x; /* next destination column */ + } + } + else /* every other work-item only copies its element, once per iteration */ + { + for(uint i = 0; i < total_filters; ++i) + { + *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); + tmp_src_ptr += depth * src_stride_z; + tmp_dst_ptr += dst_stride_x; + } + } +} + +/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F16, F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] kernel_size The convolution kernel size + * @param[in] kernel_depth The kernel depth + * @param[in] width The output tensor width + * @param[in] input_dims The input tensor dimensions + * @param[in] strides The strides of the im2col operation + * @param[in] paddings The input tensor paddings + */ +__kernel void im2col_generic( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + int kernel_size, + int kernel_depth, + int width, + int2 input_dims, + int2 strides, + int2 paddings) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst); + + // Determine output index: the work-item with global ids (x, y) writes destination row number y * width plus x + uint idx = (get_global_id(1) * width + get_global_id(0)) * dst.stride_y; + __global uchar *output_ptr =
dst.ptr + idx; + + // Top-left corner of the convolution window in input coordinates (negative values fall inside the padding) + const int top_left_x = get_global_id(0) * strides.x - paddings.x; + const int top_left_y = get_global_id(1) * strides.y - paddings.y; + + // Linearize convolution elements: depth is the outermost loop, then window rows, then window columns + for(int d = 0; d < kernel_depth; ++d) + { + for(int y = top_left_y, y_e = top_left_y + kernel_size; y < y_e; ++y) + { + for(int x = top_left_x, x_e = top_left_x + kernel_size; x < x_e; ++x, output_ptr += dst.stride_x) + { + if(x < 0 || x >= input_dims.x || y < 0 || y >= input_dims.y) + { + *((__global DATA_TYPE *)output_ptr) = 0; /* window element lies outside the input: write 0 */ + } + else + { + *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d))); + } + } + } + } + +#if defined HAS_BIAS + *((__global DATA_TYPE *)output_ptr) = 1; /* HAS_BIAS: append a 1 after the linearized window */ +#endif +} + +/** This kernel performs a reshaping of the output of the convolution layer. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: F16, F32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width The output tensor width + */ +__kernel void col2im( + IMAGE_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint width) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst); + + int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / width) * dst.stride_y + (get_global_id(1) % width) * dst.stride_x; /* id(0) selects the destination Z slice; id(1) is split into a row (id(1) / width) and a column (id(1) % width) */ + __global uchar *tmp_out_ptr = dst.ptr + idx; + *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(src.ptr)); +} + +/** This kernel reshapes the tensor's low three dimensions to a single row for the GEMM operation + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note In case biases will be added at a later stage, -DHAS_BIAS has to be passed; the last work-item then appends a single 1 after its own element. + * + * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F16, F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Same as input. + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + */ +__kernel void im2col_reduced( + TENSOR3D_DECLARATION(src), + VECTOR_DECLARATION(dst), + uint width, uint height) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + const uint image_size = width * height; + + __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x; + + *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr); + +#if defined HAS_BIAS + // Only the last work-item of the 3 dimensional global range (highest global id in every dimension) appends the 1 + if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)) + { + tmp_out_ptr += dst_stride_x; + *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1; + } +#endif +} diff --git
a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl new file mode 100644 index 0000000000..96b9cff3eb --- /dev/null +++ b/src/core/CL/cl_kernels/convolution_rectangle.cl @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "convolution3x3.cl" +#include "convolution5x5.cl" +#include "convolution7x7.cl" +#include "convolution9x9.cl" +#include "helpers.h" + +#define MAT_INDEX(i) MAT##i /* token-pastes MAT and i, e.g. MAT_INDEX(3) gives MAT3 */ + +#ifndef DATA_TYPE +#define DATA_TYPE short /* default element type of the accumulation vector used by convolution_rectangle */ +#endif + +#ifndef COMPUTE_TYPE +#define COMPUTE_TYPE int +#endif + +#ifndef DATA_TYPE_OUT +#define DATA_TYPE_OUT uchar /* default element type written to dst */ +#endif + +#ifndef DYNAMIC_MATRIX_CONVOLUTION + +/** Apply a rectangular convolution matrix to a single channel U8 input image and output a single channel image including borders + * + * @attention The matrix coefficients(MAT0, MAT1, ...
MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n + * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int. Only MATRIX_WIDTH values 3, 5, 7 and 9 are handled, and all 81 coefficients MAT0..MAT80 must be defined because they initialise the coefficient array. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convolution_rectangle( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + short matrix_coeff[81] = + { + MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, + MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17, + MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26, + MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, + MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, + MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53, + MAT54, MAT55,
MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62, + MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, + MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80 + }; + + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0; + + for(int i = 0; i < MATRIX_HEIGHT; i++) /* one matrix row per iteration; row i is read at vertical offset i - MATRIX_HEIGHT / 2 and its coefficients start at index i * MATRIX_WIDTH */ + { +#if MATRIX_WIDTH == 3 + pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3], + matrix_coeff[2 + i * 3]); +#endif + +#if MATRIX_WIDTH == 5 + pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5], + matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]); +#endif + +#if MATRIX_WIDTH == 7 + pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7], + matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7], + matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]); +#endif + +#if MATRIX_WIDTH == 9 + pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9], + matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9], + matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]); +#endif + } + + pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE; /* SCALE is applied unconditionally here (no zero check) */ + + // Convert the result to DATA_TYPE_OUT with CONVERT_SAT and store the 8 values in dst + vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr)); +} + +#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl new file mode 100644 index 0000000000..c8eaa95352 --- /dev/null +++ b/src/core/CL/cl_kernels/depth_convert.cl @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifdef SATURATE +#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type) /* -DSATURATE selects CONVERT_SAT for the down conversion */ +#else +#define CONVERT_DOWN(x, type) CONVERT(x, type) /* otherwise the plain CONVERT macro is used */ +#endif + +/** This function performs a down-scaling depth conversion. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN=short -DDATA_TYPE_OUT=uchar. If -DSATURATE is also passed, the conversion uses CONVERT_SAT instead of CONVERT. + * + * @param[in] in_ptr Pointer to the source image.
Supported data types: U8, U16, S16, U32 or S32 + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] shift The integer shift amount value. Supported data types: S32 + */ +__kernel void convert_depth_down( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int shift) +{ + // Get pixels pointer + Image in = CONVERT_TO_IMAGE_STRUCT(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load 16 input elements + VEC_DATA_TYPE(DATA_TYPE_IN, 16) + in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr); + vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); /* shift right in the input type first, then convert with CONVERT_DOWN and store 16 elements */ +} + +/** This function performs an up-scaling depth conversion. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short + * + * @param[in] in_ptr Pointer to the source image.
Supported data types: U8, U16, S16, U32 or S32 + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] shift The integer shift amount value. Supported data types: S32 + */ +__kernel void convert_depth_up( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int shift) +{ + // Get pixels pointer + Image in = CONVERT_TO_IMAGE_STRUCT(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load 16 input elements and convert them to DATA_TYPE_OUT before shifting + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_data = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + vstore16(in_data << shift, 0, (__global DATA_TYPE_OUT *)out.ptr); /* the left shift is applied to the already converted values */ +} diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl new file mode 100644 index 0000000000..0e810d2e7c --- /dev/null +++ b/src/core/CL/cl_kernels/derivative.cl @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This OpenCL kernel that computes the first-order derivative. + * + * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient + * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 + * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 + * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void derivative( + IMAGE_DECLARATION(src) +#ifdef GRAD_X + , + IMAGE_DECLARATION(dst_gx) +#endif +#ifdef GRAD_Y + , + IMAGE_DECLARATION(dst_gy) +#endif +) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); +#ifdef GRAD_X + Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); +#endif +#ifdef GRAD_Y + Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); +#endif + +#ifdef GRAD_X + /* X gradient for 16 pixels: values loaded at x + 1 minus values loaded at x - 1, widened to short before subtracting */ + short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0))); + short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0))); + vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr)); +#endif +#ifdef GRAD_Y + /* Y gradient for 16 pixels: values loaded at y + 1 minus values loaded at y - 1, widened to short before subtracting */ + short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1))); + short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1))); + vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr)); +#endif +} diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl new file mode 100644 index 0000000000..c62c701757 --- /dev/null +++ b/src/core/CL/cl_kernels/dilate.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function dilates an input image. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void dilate( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Load 16 pixels from each of the rows y - 1, y and y + 1, every load starting at column x - 1 */ + uchar16 top = vload16(0, offset(&src, -1, -1)); + uchar16 middle = vload16(0, offset(&src, -1, 0)); + uchar16 bottom = vload16(0, offset(&src, -1, 1)); + + /* 3x3 maximum: first the per-column maximum of the three rows, then the maximum of three adjacent columns for each of the 8 outputs */ + uchar16 tmp = max(top, max(middle, bottom)); + uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789)); + + vstore8(out, 0, dst.ptr); +} diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl new file mode 100644 index 0000000000..6576f1827f --- /dev/null +++ b/src/core/CL/cl_kernels/erode.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function erodes an input image. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported data types: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void erode( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Load 16 pixels from each of the rows y - 1, y and y + 1, every load starting at column x - 1 */ + uchar16 top = vload16(0, offset(&src, -1, -1)); + uchar16 middle = vload16(0, offset(&src, -1, 0)); + uchar16 bottom = vload16(0, offset(&src, -1, 1)); + + /* 3x3 minimum: first the per-column minimum of the three rows, then the minimum of three adjacent columns for each of the 8 outputs */ + uchar16 tmp = min(top, min(middle, bottom)); + uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789)); + + vstore8(out, 0, dst.ptr); +} diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl new file mode 100644 index 0000000000..470d14a7b0 --- /dev/null +++ b/src/core/CL/cl_kernels/fast_corners.cl @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "types.h" + +/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. Each entry is an { x, y } offset from P, in pixels. + * + * . . F 0 1 . . . + * . E . . . 2 . . + * D . . . . . 3 . + * C . . P . . 4 . + * B . . . . . 5 . + * . A . . . 6 . . + * . . 9 8 7 . . . + */ +constant int offsets_s[16][2] = +{ + { 0, -3 }, // 0 + { 1, -3 }, // 1 + { 2, -2 }, // 2 + { 3, -1 }, // 3 + { 3, 0 }, // 4 + { 3, 1 }, // 5 + { 2, 2 }, // 6 + { 1, 3 }, // 7 + { 0, 3 }, // 8 + { -1, 3 }, // 9 + { -2, 2 }, // A + { -3, 1 }, // B + { -3, 0 }, // C + { -3, -1 }, // D + { -2, -2 }, // E + { -1, -3 }, // F +}; + +/** Load a pixel and set the mask values. + * + * @param[in] ptr The pointer to the candidate (center) pixel in the source image + * @param[in] a Index to indicate the position in the Bresenham circle + * @param[in] stride Stride of source image in y dimension (it is multiplied by the y offset of the circle position) + * @param[in] dark The left end of the threshold range + * @param[in] bright The right end of the threshold range + * @param[out] dark_mask The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark + * @param[out] bright_mask The bit-set mask records bright pixels.
Its bit is set as 1 if the corresponding pixel is bright + * + */ +#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \ + { \ + unsigned char pixel; \ + pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \ + dark_mask |= (pixel < dark) << a; \ + bright_mask |= (pixel > bright) << a; \ + } + +/** Checks if a pixel is a corner. Pixel is considered as a corner if the 9 continuous pixels in the Bresenham circle are bright or dark. + * + * @param[in,out] bright_mask The mask recording positions of bright pixels (shifted right while scanning) + * @param[in,out] dark_mask The mask recording positions of dark pixels (shifted right while scanning) + * @param[out] isCorner Indicate whether candidate pixel is corner + */ +#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \ + { \ + for(int i = 0; i < 16; i++) \ + { \ + isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \ + isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \ + if(isCorner) \ + { \ + break; \ + } \ + bright_mask >>= 1; \ + dark_mask >>= 1; \ + } \ + } + +/* Calculate pixel's strength: binary search between threshold and 255 for the largest tested threshold at which the candidate pixel is still detected as a corner */ +uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold) +{ + short a = threshold; + short b = 255; + while(b - a > 1) + { + uchar c = convert_uchar_sat((a + b) / 2); + unsigned int bright_mask = 0; + unsigned int dark_mask = 0; + + unsigned char p_bright = add_sat(candidate_pixel, c); + unsigned char p_dark = sub_sat(candidate_pixel, c); + + bool isCorner = 0; + + for(uint i = 0; i < 16; i++) + { + LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask) + } + + bright_mask |= (bright_mask << 16); + dark_mask |= (dark_mask << 16); + CHECK_CORNER(bright_mask, dark_mask, isCorner); + + if(isCorner) + { + a = convert_short(c); + } + else + { + b = convert_short(c); + } + } + return a; +} + +/** Fast corners implementation. Calculates and returns the strength of each pixel.
+ * + * The algorithm loops through the 16 pixels in the Bresenham circle and sets the low 16 bits of the masks if the corresponding pixel is bright + * or dark. It then copies the low 16 bits to the high 16 bits of the masks and right-shifts the masks to check whether the 9 contiguous bits + * from the LSB are set. + * + * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 + * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] threshold_value Threshold value.
+ * + */ +__kernel void fast_corners( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(output), + float threshold_value) +{ + Image in = CONVERT_TO_IMAGE_STRUCT(input); + Image out = CONVERT_TO_IMAGE_STRUCT(output); + + const unsigned char threshold = (uchar)threshold_value; + + unsigned int bright_mask = 0; + unsigned int dark_mask = 0; + + unsigned char isCorner = 0; + + unsigned char p = *in.ptr; + unsigned char p_bright = add_sat(p, threshold); + unsigned char p_dark = sub_sat(p, threshold); + + /* Test circle positions 0, 4, 8 and 12 first */ + LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + + /* Early exit: 0x1111 selects bits 0, 4, 8 and 12; if none of those pixels is bright or dark the output is 0 */ + if(((bright_mask | dark_mask) & 0x1111) == 0) + { + *out.ptr = 0; + return; + } + + LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) + LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright,
dark_mask, bright_mask) + + /* Duplicate the 16 circle bits into the upper half so that runs wrapping around position 15 -> 0 are found by CHECK_CORNER */ + bright_mask |= (bright_mask << 16); + dark_mask |= (dark_mask << 16); + + CHECK_CORNER(bright_mask, dark_mask, isCorner) + + if(!isCorner) + { + *out.ptr = 0; + return; + } + +#ifndef USE_MAXSUPPRESSION + *out.ptr = 1; +#else + + *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold); +#endif +} + +/** Copy result to Keypoint buffer and count number of corners + * + * @param[in] input_ptr Pointer to the image with calculated strengths. Supported data types: U8 + * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] max_num_points The maximum number of keypoints the array can hold + * @param[in] offset The number of skipped pixels; it is added to both the x and y work-item ids of each keypoint + * @param[in,out] num_of_points Number of points found + * @param[out] out The keypoints found + * + */ +__kernel void copy_to_keypoint( + IMAGE_DECLARATION(input), + uint max_num_points, + uint offset, + __global uint *num_of_points, + __global Keypoint *out) +{ +#ifndef UPDATE_NUMBER + if(*num_of_points >= max_num_points) + { + return; + } +#endif + + Image in = CONVERT_TO_IMAGE_STRUCT(input); + + uchar value = *in.ptr; + + if(value > 0) + { + /* The counter is incremented before the capacity check, so it can end up larger than max_num_points */ + int id = atomic_inc(num_of_points); + if(id < max_num_points) + { + out[id].strength = value; + out[id].x = get_global_id(0) + offset; + out[id].y = get_global_id(1) + offset; + out[id].tracking_status = 1; + } + } +} diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl new file mode 100644 index 0000000000..df635869b1 --- /dev/null +++
b/src/core/CL/cl_kernels/fill_border.cl @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel. + * + * @attention The DATA_TYPE needs to be passed at the compile time. + * e.g. -DDATA_TYPE=int + * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2 + * + * @param[in,out] buf_ptr Pointer to the source image.
Supported data types: U8, U16, S16, U32, S32, F32 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos XY coordinate indicating the start point of the valid region + */ +__kernel void fill_image_borders_replicate( + IMAGE_DECLARATION(buf), + uint width, + uint height, + int2 start_pos) +{ + Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf); + + // Update pointer to point to the starting point of the valid region + buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; + + /* Work-items with gid0 >= total_width each fill the left/right border of row gidH; the others each fill the top/bottom border of column gidW */ + const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT; + const int gid0 = get_global_id(0); + const int gidH = gid0 - total_width; + const int gidW = gid0 - BORDER_SIZE_LEFT; + + if(gidH >= 0) + { + // Handle left border + DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH); + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val; + } + // Handle right border + DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH); + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val; + } + } + else + { + // Get value for corners: clamp the source column to [0, width - 1] so columns outside the valid region replicate the nearest valid column + int val_idx = gidW; + if(gidW < 0 || gidW > (width - 1)) + { + val_idx = gidW < 0 ?
0 : width - 1; + } + + // Handle top border + DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0); + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val; + } + // Handle bottom border + DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1); + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val; + } + } +} + +/** Fill N pixels of the padding edge of a single channel image with a constant value. + * + * @attention The DATA_TYPE needs to be passed at the compile time. + * e.g. -DDATA_TYPE=int + * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2 + * + * @param[out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos XY coordinate indicating the start point of the valid region + * @param[in] constant_value Constant value to use to fill the edges + */ +__kernel void fill_image_borders_constant( + IMAGE_DECLARATION(buf), + uint width, + uint height, + int2 start_pos, + DATA_TYPE constant_value) +{ + Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf); + + // Update pointer to point to the starting point of the valid region + buf.ptr +=
start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; + + /* Work-items with gid0 >= total_width each fill the left/right border of row gidH; the others each fill the top/bottom border of column gidW */ + const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT; + const int gid0 = get_global_id(0); + const int gidH = gid0 - total_width; + const int gidW = gid0 - BORDER_SIZE_LEFT; + + if(gidH >= 0) + { + // Handle left border + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value; + } + // Handle right border + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value; + } + } + else + { + // Handle top border + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value; + } + // Handle bottom border + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value; + } + } +} diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl new file mode 100644 index 0000000000..618937f36d --- /dev/null +++ b/src/core/CL/cl_kernels/gaussian_pyramid.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction + * + * @note Each thread computes 8 pixels + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void gaussian1x5_sub_x( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Load values for the convolution (20 bytes needed) + uchar16 temp0 = vload16(0, src.ptr); + uchar4 temp1 = vload4(0, src.ptr + 16); + + // For each of the 5 filter taps gather every second pixel (sub-sampling by 2) and convert to USHORT8 + ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE)); + ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF)); + ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0)); + ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1)); + ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02)); + + // Compute convolution along the X direction with weights 1, 4, 6, 4, 1 (the sum is not scaled in this kernel) + ushort8 pixels = l2_data + r2_data; + pixels += l1_data * (ushort8)4; + pixels += m_data * (ushort8)6; + pixels += r1_data * (ushort8)4; + + // Store result + vstore8(pixels, 0, (__global ushort *)dst.ptr); +} + +/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction + * + * @note Each thread computes 8 pixels + * + * @param[in] src_ptr Pointer to the source image.
Supported data types: U16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void gaussian5x1_sub_y( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Load 8 values from each of 5 consecutive rows (row offsets 0 to 4) + ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0)); + ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1)); + ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2)); + ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3)); + ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4)); + + // Compute convolution along the Y direction with weights 1, 4, 6, 4, 1 + ushort8 pixels = u2_data + d2_data; + pixels += u1_data * (ushort8)4; + pixels += m_data * (ushort8)6; + pixels += d1_data * (ushort8)4; + + // Scale result: divide by 256 (right shift by 8) + pixels >>= (ushort8)8; + + // Store result, saturating each value to U8 + vstore8(convert_uchar8_sat(pixels), 0, dst.ptr); +} diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl new file
mode 100644 index 0000000000..caf6e3ffd8 --- /dev/null +++ b/src/core/CL/cl_kernels/gemm.cl @@ -0,0 +1,1099 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
                                    IMAGE_DECLARATION(dst))
{
    /* Source address (matrix B) of the float4 handled by this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    /* Work-item coordinates. They are swapped when addressing the destination:
     * global id 0 selects the destination row, global id 1 the 16-byte slot inside that row. */
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);

    /* 16 = size in bytes of the float4 written by each work-item */
    const uint dst_addr_in_bytes = (col * dst_stride_y + dst_offset_first_element_in_bytes) + row * 16;

    /* Copy one vector of four floats from matrix B to its transposed position */
    const float4 values = vload4(0, (__global float *)src.ptr);
    vstore4(values, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
}

/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
 *
 * @param[in]  src_ptr                           Pointer to the source matrix.
Supported data types: F16
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F16
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
                                    IMAGE_DECLARATION(dst))
{
    /* Source address (matrix B) of the half8 handled by this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    /* Work-item coordinates. They are swapped when addressing the destination:
     * global id 0 selects the destination row, global id 1 the 16-byte slot inside that row. */
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);

    /* 16 = size in bytes of the half8 written by each work-item */
    const uint dst_addr_in_bytes = (col * dst_stride_y + dst_offset_first_element_in_bytes) + row * 16;

    /* Copy one vector of eight halfs from matrix B to its transposed position */
    const half8 values = vload8(0, (__global half *)src.ptr);
    vstore8(values, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
}

/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
 *
 * @param[in]  src_ptr                           Pointer to the source matrix.
Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
                                    IMAGE_DECLARATION(dst))
{
    /* Source address (matrix B) of the uchar16 handled by this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    /* Work-item coordinates. They are swapped when addressing the destination:
     * global id 0 selects the destination row, global id 1 the 16-byte slot inside that row. */
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);

    /* 16 = size in bytes of the uchar16 written by each work-item */
    const uint dst_addr_in_bytes = (col * dst_stride_y + dst_offset_first_element_in_bytes) + row * 16;

    /* Copy one vector of sixteen bytes from matrix B to its transposed position */
    const uchar16 values = vload16(0, (__global uchar *)src.ptr);
    vstore16(values, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
}

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
 *
 * @param[in]  src_ptr                           Pointer to the source matrix.
Supported data types: U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U32/S32/F32
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
                                       IMAGE_DECLARATION(dst))
{
    /* Resolve the source and destination addresses of this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Read four elements from each of four consecutive rows of matrix A */
    const float4 row0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
    const float4 row1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
    const float4 row2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
    const float4 row3 = vload4(0, (__global float *)(offset(&src, 0, 3)));

    /* Write the block column by column: output vector k holds element k of every row.
     * vstore4(v, k, p) writes at p + 4 * k, so the four vectors are contiguous. */
    __global float *out = (__global float *)dst.ptr;

    vstore4((float4)(row0.s0, row1.s0, row2.s0, row3.s0), 0, out);
    vstore4((float4)(row0.s1, row1.s1, row2.s1, row3.s1), 1, out);
    vstore4((float4)(row0.s2, row1.s2, row2.s2, row3.s2), 2, out);
    vstore4((float4)(row0.s3, row1.s3, row2.s3, row3.s3), 3, out);
}

/** This OpenCL kernel reshapes the input matrix by interleaving the values of four consecutive rows
 *
 * Each work-item reads 8 elements from each of 4 rows and writes them column by column (4 values per column).
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U16/S16/F16
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U16/S16/F16
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
                                       IMAGE_DECLARATION(dst))
{
    /* Resolve the source and destination addresses of this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Read eight elements from each of four consecutive rows of matrix A */
    const half8 row0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
    const half8 row1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
    const half8 row2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
    const half8 row3 = vload8(0, (__global half *)(offset(&src, 0, 3)));

    /* Every output vector packs two consecutive columns (4 row values each).
     * vstore8(v, k, p) writes at p + 8 * k. */
    __global half *out = (__global half *)dst.ptr;

    const half8 cols01 = (half8)((half4)(row0.s0, row1.s0, row2.s0, row3.s0), (half4)(row0.s1, row1.s1, row2.s1, row3.s1));
    vstore8(cols01, 0, out);

    const half8 cols23 = (half8)((half4)(row0.s2, row1.s2, row2.s2, row3.s2), (half4)(row0.s3, row1.s3, row2.s3, row3.s3));
    vstore8(cols23, 1, out);

    const half8 cols45 = (half8)((half4)(row0.s4, row1.s4, row2.s4, row3.s4), (half4)(row0.s5, row1.s5, row2.s5, row3.s5));
    vstore8(cols45, 2, out);

    const half8 cols67 = (half8)((half4)(row0.s6, row1.s6, row2.s6, row3.s6), (half4)(row0.s7, row1.s7, row2.s7, row3.s7));
    vstore8(cols67, 3, out);
}

/** This OpenCL kernel reshapes the input matrix by interleaving the values of four consecutive rows
 *
 * Each work-item reads 16 elements from each of 4 rows and writes them column by column (4 values per column).
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U8/S8
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
                                      IMAGE_DECLARATION(dst))
{
    /* Resolve the source and destination addresses of this work-item */
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Read sixteen elements from each of four consecutive rows of matrix A */
    const uchar16 row0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
    const uchar16 row1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
    const uchar16 row2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
    const uchar16 row3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));

    /* Every output vector packs four consecutive columns (4 row values each).
     * vstore16(v, k, p) writes at p + 16 * k. */
    __global uchar *out = (__global uchar *)dst.ptr;

    const uchar16 cols0to3 = (uchar16)((uchar4)(row0.s0, row1.s0, row2.s0, row3.s0), (uchar4)(row0.s1, row1.s1, row2.s1, row3.s1),
                                       (uchar4)(row0.s2, row1.s2, row2.s2, row3.s2), (uchar4)(row0.s3, row1.s3, row2.s3, row3.s3));
    vstore16(cols0to3, 0, out);

    const uchar16 cols4to7 = (uchar16)((uchar4)(row0.s4, row1.s4, row2.s4, row3.s4), (uchar4)(row0.s5, row1.s5, row2.s5, row3.s5),
                                       (uchar4)(row0.s6, row1.s6, row2.s6, row3.s6), (uchar4)(row0.s7, row1.s7, row2.s7, row3.s7));
    vstore16(cols4to7, 1, out);

    const uchar16 cols8toB = (uchar16)((uchar4)(row0.s8, row1.s8, row2.s8, row3.s8), (uchar4)(row0.s9, row1.s9, row2.s9, row3.s9),
                                       (uchar4)(row0.sA, row1.sA, row2.sA, row3.sA), (uchar4)(row0.sB, row1.sB, row2.sB, row3.sB));
    vstore16(cols8toB, 2, out);

    const uchar16 colsCtoF = (uchar16)((uchar4)(row0.sC, row1.sC, row2.sC, row3.sC), (uchar4)(row0.sD, row1.sD, row2.sD, row3.sD),
                                       (uchar4)(row0.sE, row1.sE, row2.sE, row3.sE), (uchar4)(row0.sF, row1.sF, row2.sF, row3.sF));
    vstore16(colsCtoF, 3, out);
}

/** This kernel accumulates each row with the biases vector
 *
 * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
 * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
 * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
 * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
 * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
+ * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemm_accumulate_biases_f32( + IMAGE_DECLARATION(accum), + VECTOR_DECLARATION(biases)) +{ + Image accum = CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + + float4 accum_value = vload4(0, (__global float *)accum.ptr); + float4 biases_value = vload4(0, (__global float *)biases.ptr); + accum_value = biases_value + accum_value; + + // Store result in the accummulate buffer + vstore4(accum_value, 0, (__global float *)accum.ptr); +} + +/** This kernel accumulates each row with the biases vector + * + * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16 + * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes) + * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes) + * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor + * @param[in] biases_ptr Pointer to the biases vector. Same as input. 
+ * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemm_accumulate_biases_f16( + IMAGE_DECLARATION(accum), + VECTOR_DECLARATION(biases)) +{ + Image accum = CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + + half8 accum_value = vload8(0, (__global half *)accum.ptr); + half8 biases_value = vload8(0, (__global half *)biases.ptr); + accum_value = biases_value + accum_value; + + // Store result in the accummulate buffer + vstore8(accum_value, 0, (__global half *)accum.ptr); +} + +#if(defined WIDTH_MATRIX_B) +/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication + * + * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B + * + * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported formats: U8
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported formats: U8
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  a_offset                           Offset to be added to each element of the matrix A
 * @param[in]  b_offset                           Offset to be added to each element of the matrix B.
 * @param[in]  c_offset                           Offset to be added to each element of the matrix C (before the multiplication by c_mult_int).
 * @param[in]  c_mult_int                         Multiplied with each element of the matrix C (after c_offset has been added).
 * @param[in]  shift                              Number of bits to shift right the result.
 */
__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
                         IMAGE_DECLARATION(src1),
                         IMAGE_DECLARATION(dst),
                         int a_offset,
                         int b_offset,
                         int c_offset,
                         int c_mult_int,
                         int shift)
{
    /* src_addr.s0 = address of matrix A */
    /* src_addr.s1 = address of matrix B */

    /* Compute address for matrix A and B */
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    /* Add offset_first_element_in_bytes */
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    /* Compute end row address for matrix B */
    int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

    /* Reset accumulators (integer accumulators, so use an integer zero) */
    int16 c00 = 0;
    int16 c10 = 0;
    int16 c20 = 0;
    int16 c30 = 0;

    /* Main loop: two steps per iteration, i.e. 8 bytes of matrix A and 32 bytes of matrix B.
     * The bound must leave at least 32 bytes of matrix B: with a smaller margin a single
     * remaining 16-byte step would be handled here and 16 bytes past the end of the row
     * (and 4 bytes past the end of matrix A's data) would be read and accumulated. */
    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        int8 a0  = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;

        int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

        c00 += (int16)a0.s4 * b1;
        c10 += (int16)a0.s5 * b1;
        c20 += (int16)a0.s6 * b1;
        c30 += (int16)a0.s7 * b1;
    }

    /* Tail loop: one step per iteration (4 bytes of matrix A, 16 bytes of matrix B) */
    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        int4 a0  = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Requantize: add the output offset, apply the integer multiplier, then shift right */
    c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
    c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
    c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
    c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;

    /* Store 4x16 block, saturating every value to the uchar range */
    vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
    vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
    vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
    vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
}
#endif

#if(defined WIDTH_MATRIX_B && defined ALPHA)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
 *
 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix.
Supported data types: F32
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    /* Component .s0 refers to matrix A (row chosen by global id 1),
     * component .s1 refers to matrix B (row chosen by global id 0). */
    const int2 row_ids     = (int2)(get_global_id(1), get_global_id(0));
    const int2 row_strides = (int2)((src0_stride_y), (src1_stride_y));
    const int2 first_elems = (int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes);

    /* Byte offsets of both rows, divided by 4 to express them in units of float */
    int2 src_addr = (row_ids * row_strides + first_elems) >> 2;

    /* End of the row of matrix B, in units of float */
    const int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

    __global float *mtx_a = (__global float *)src0_ptr;
    __global float *mtx_b = (__global float *)src1_ptr;

    /* One float4 accumulator per output row of the 4x4 block */
    float4 acc0 = 0.0f;
    float4 acc1 = 0.0f;
    float4 acc2 = 0.0f;
    float4 acc3 = 0.0f;

    /* Main loop: two steps per iteration (8 floats of A and 8 floats of B) */
    while(src_addr.s1 <= (end_row_mtx_b - 8))
    {
        /* First step: values from matrix A (interleaved) and matrix B (transposed) */
        float4 a = vload4(0, mtx_a + src_addr.s0);
        float4 b = vload4(0, mtx_b + src_addr.s1);

        acc0 += (float4)a.s0 * b;
        acc1 += (float4)a.s1 * b;
        acc2 += (float4)a.s2 * b;
        acc3 += (float4)a.s3 * b;

        /* Second step: the next four floats of each matrix */
        a = vload4(0, mtx_a + src_addr.s0 + 4);
        b = vload4(0, mtx_b + src_addr.s1 + 4);

        acc0 += (float4)a.s0 * b;
        acc1 += (float4)a.s1 * b;
        acc2 += (float4)a.s2 * b;
        acc3 += (float4)a.s3 * b;

        src_addr += (int2)(8, 8);
    }

    /* Tail loop: one step per iteration (4 floats of A and 4 floats of B) */
    while(src_addr.s1 < end_row_mtx_b)
    {
        const float4 a = vload4(0, mtx_a + src_addr.s0);
        const float4 b = vload4(0, mtx_b + src_addr.s1);

        acc0 += (float4)a.s0 * b;
        acc1 += (float4)a.s1 * b;
        acc2 += (float4)a.s2 * b;
        acc3 += (float4)a.s3 * b;

        src_addr += (int2)(4, 4);
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Scale by the weight of the matrix product and store the 4x4 block, one row per store */
    vstore4(acc0 * (float4)ALPHA, 0, (__global float *)(offset(&dst, 0, 0)));
    vstore4(acc1 * (float4)ALPHA, 0, (__global float *)(offset(&dst, 0, 1)));
    vstore4(acc2 * (float4)ALPHA, 0, (__global float *)(offset(&dst, 0, 2)));
    vstore4(acc3 * (float4)ALPHA, 0, (__global float *)(offset(&dst, 0, 3)));
}

/** This OpenCL kernel is optimised for Bifrost.
It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
 *
 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    // Row of matrix A is chosen by global id 1, row of matrix B by global id 0
    __global float *mtx_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
    __global float *mtx_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);

    // End of the row of matrix B
    __global float *mtx_b_end = mtx_b + WIDTH_MATRIX_B;

    // 4x4 block of scalar accumulators: accRC belongs to output row R, column C
    float acc00 = 0.0f, acc01 = 0.0f, acc02 = 0.0f, acc03 = 0.0f;
    float acc10 = 0.0f, acc11 = 0.0f, acc12 = 0.0f, acc13 = 0.0f;
    float acc20 = 0.0f, acc21 = 0.0f, acc22 = 0.0f, acc23 = 0.0f;
    float acc30 = 0.0f, acc31 = 0.0f, acc32 = 0.0f, acc33 = 0.0f;

    // One accumulation step: adds the outer product of four values of A and four values of B
    // to the accumulator block. Expands to sixteen scalar fma() calls.
#define GEMM_MM_F32_BIFROST_STEP(a, b)        \
    do                                        \
    {                                         \
        acc00 = fma((a).s0, (b).s0, acc00);   \
        acc01 = fma((a).s0, (b).s1, acc01);   \
        acc02 = fma((a).s0, (b).s2, acc02);   \
        acc03 = fma((a).s0, (b).s3, acc03);   \
                                              \
        acc10 = fma((a).s1, (b).s0, acc10);   \
        acc11 = fma((a).s1, (b).s1, acc11);   \
        acc12 = fma((a).s1, (b).s2, acc12);   \
        acc13 = fma((a).s1, (b).s3, acc13);   \
                                              \
        acc20 = fma((a).s2, (b).s0, acc20);   \
        acc21 = fma((a).s2, (b).s1, acc21);   \
        acc22 = fma((a).s2, (b).s2, acc22);   \
        acc23 = fma((a).s2, (b).s3, acc23);   \
                                              \
        acc30 = fma((a).s3, (b).s0, acc30);   \
        acc31 = fma((a).s3, (b).s1, acc31);   \
        acc32 = fma((a).s3, (b).s2, acc32);   \
        acc33 = fma((a).s3, (b).s3, acc33);   \
    } while(0)

    // Main loop: four steps per iteration (16 floats of A and 16 floats of B).
    // vload4(k, p) reads the four floats starting at p + 4 * k.
    for(; mtx_b <= (mtx_b_end - 16); mtx_a += 16, mtx_b += 16)
    {
        const float4 a0 = vload4(0, mtx_a);
        const float4 b0 = vload4(0, mtx_b);
        GEMM_MM_F32_BIFROST_STEP(a0, b0);

        const float4 a1 = vload4(1, mtx_a);
        const float4 b1 = vload4(1, mtx_b);
        GEMM_MM_F32_BIFROST_STEP(a1, b1);

        const float4 a2 = vload4(2, mtx_a);
        const float4 b2 = vload4(2, mtx_b);
        GEMM_MM_F32_BIFROST_STEP(a2, b2);

        const float4 a3 = vload4(3, mtx_a);
        const float4 b3 = vload4(3, mtx_b);
        GEMM_MM_F32_BIFROST_STEP(a3, b3);
    }

    // Tail loop: one step per iteration (4 floats of A and 4 floats of B)
    for(; mtx_b < mtx_b_end; mtx_a += 4, mtx_b += 4)
    {
        const float4 a0 = vload4(0, mtx_a);
        const float4 b0 = vload4(0, mtx_b);
        GEMM_MM_F32_BIFROST_STEP(a0, b0);
    }

#undef GEMM_MM_F32_BIFROST_STEP

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Multiply by the weight of matrix product
    acc00 = acc00 * ALPHA;
    acc01 = acc01 * ALPHA;
    acc02 = acc02 * ALPHA;
    acc03 = acc03 * ALPHA;
    acc10 = acc10 * ALPHA;
    acc11 = acc11 * ALPHA;
    acc12 = acc12 * ALPHA;
    acc13 = acc13 * ALPHA;
    acc20 = acc20 * ALPHA;
    acc21 = acc21 * ALPHA;
    acc22 = acc22 * ALPHA;
    acc23 = acc23 * ALPHA;
    acc30 = acc30 * ALPHA;
    acc31 = acc31 * ALPHA;
    acc32 = acc32 * ALPHA;
    acc33 = acc33 * ALPHA;

    // NOTE(review): no data is exchanged between work-items in this kernel, so this barrier
    // looks like a scheduling aid rather than a correctness requirement - confirm before removing.
    barrier(CLK_GLOBAL_MEM_FENCE);

    // Store 4x4 block, one output row per store
    vstore4((float4)(acc00, acc01, acc02, acc03), 0, (__global float *)(offset(&dst, 0, 0)));
    vstore4((float4)(acc10, acc11, acc12, acc13), 0, (__global float *)(offset(&dst, 0, 1)));
    vstore4((float4)(acc20, acc21, acc22, acc23), 0, (__global float *)(offset(&dst, 0, 2)));
    vstore4((float4)(acc30, acc31, acc32, acc33), 0, (__global float *)(offset(&dst, 0, 3)));
}

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8_f16 before running the matrix multiplication
 *
 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix.
Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F16
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst))
{
    /* src_addr.s0 = address of matrix A */
    /* src_addr.s1 = address of matrix B */

    /* Compute address for matrix A and B */
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    /* Add offset_first_element_in_bytes */
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    /* Divide by 2 in order to get the src_addr in unit of half */
    src_addr = src_addr >> 1;

    /* Compute end row address for matrix B */
    int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

    /* Reset accumulators: one half8 per output row of the 4x8 block */
    half8 c00 = 0.0f;
    half8 c10 = 0.0f;
    half8 c20 = 0.0f;
    half8 c30 = 0.0f;

    /* Main loop, unrolled by two: every iteration consumes 8 halves of A and 16 halves of B.
     * At least 16 halves of B must therefore remain, otherwise the second vload8 below
     * would read past the end of the row. Any leftover step is handled by the tail loop. */
    for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
        half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;

        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
        b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;
    }

    /* Tail loop: one step at a time (4 halves of A, 8 halves of B) */
    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
        half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Multiply by the weight of matrix product */
    c00 = c00 * (half8)ALPHA;
    c10 = c10 * (half8)ALPHA;
    c20 = c20 * (half8)ALPHA;
    c30 = c30 * (half8)ALPHA;

    /* Store 4x8 block */
    vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
    vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
    vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
    vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
}

#if(defined WIDTH_VECTOR_A)
/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
 *
 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
 *
 * @attention The input vector A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix.
Supported data types: F32
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst))
{
    /* Each workitem produces four consecutive output columns */
    int idx = get_global_id(0) * 4;

    /* Compute the address for the vector A and matrix B (byte offsets: .s0 for A, .s1 for B) */
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
    src_addr.s1 += idx * sizeof(float);

    int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 acc = 0.0f;

    /* Main loop, two elements of A (and two rows of B) per iteration.
     * sizeof() yields an unsigned size_t, so it is cast to int to keep the bound signed:
     * with an unsigned bound, a vector shorter than two elements makes the subtraction
     * wrap around to a huge value and the loop would read out of bounds. */
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
    {
        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

        acc += b0 * (float4)a0.s0;
        acc += b1 * (float4)a0.s1;
    }

    /* Tail loop: remaining element of A, one row of B at a time */
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
    {
        float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

        acc += b0 * (float4)a0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Multiply by the weight of vector-matrix product */
    acc = acc * (float4)ALPHA;

    vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
}

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
 *
 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
 *
 * @attention The input vector A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix.
Supported data types: F16
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F16
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst))
{
    /* Each workitem produces eight consecutive output columns */
    int idx = get_global_id(0) * 8;

    /* Compute the address for the vector A and matrix B (byte offsets: .s0 for A, .s1 for B) */
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
    src_addr.s1 += idx * sizeof(half);

    int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));

    half8 acc = 0.0f;

    /* Main loop, four elements of A (and four rows of B) per iteration.
     * sizeof() yields an unsigned size_t, so it is cast to int to keep the bound signed:
     * with an unsigned bound, a vector shorter than four elements makes the subtraction
     * wrap around to a huge value and the loop would read out of bounds. */
    for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
    {
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
        half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
        half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
        half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

        acc += b0 * (half8)a0.s0;
        acc += b1 * (half8)a0.s1;
        acc += b2 * (half8)a0.s2;
        acc += b3 * (half8)a0.s3;
    }

    /* Tail loop: remaining elements of A, one row of B at a time */
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
    {
        half  a0 = *((__global half *)(src0_ptr + src_addr.s0));
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

        acc += b0 * (half8)a0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Multiply by the weight of vector-matrix product */
    acc = acc * (half8)ALPHA;

    vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
}
#endif /* (defined WIDTH_VECTOR_A) */
#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */

#if(defined BETA)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 *
 * @attention The beta's value need to be passed at compile time using -DBETA
 *
 * @param[in]  src_ptr                           Pointer to the source matrix.
Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    /* Per-workitem views: matrix C is read from src, the A x B product is updated in dst */
    Image mtx_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mtx_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global float *ab_ptr = (__global float *)mtx_ab.ptr;

    /* Four values of the (already alpha-weighted) product, then four values of matrix C */
    const float4 ab_vals = vload4(0, ab_ptr);
    const float4 c_vals  = vload4(0, (__global float *)mtx_c.ptr);

    /* alpha * axb + beta * c, written back over the product (in-place) */
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_ptr);
}

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 *
 * @param[in]  src_ptr                           Pointer to the source matrix.
Supported data types: F16
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F16
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    /* Per-workitem views: matrix C is read from src, the A x B product is updated in dst */
    Image mtx_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mtx_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    __global half *ab_ptr = (__global half *)mtx_ab.ptr;

    /* Eight values of the (already alpha-weighted) product, then eight values of matrix C */
    const half8 ab_vals = vload8(0, ab_ptr);
    const half8 c_vals  = vload8(0, (__global half *)mtx_c.ptr);

    /* alpha * axb + beta * c, written back over the product (in-place) */
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_ptr);
}
#endif /* (defined BETA) */

#if(defined WIDTH_VECTOR_A)
/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
 *
 * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @attention The input A and matrix B must not be
reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    /* Each workitem produces four consecutive output columns of row idy */
    int idx = get_global_id(0) * 4;
    int idy = get_global_id(1);

    /* Compute the address for the vector A and matrix B: row idy of A is paired with plane idy of B */
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
    src_addr.s1 += idx * sizeof(float);

    int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 acc = 0.0f;

    /* Main loop, two elements of A (and two rows of B) per iteration.
     * sizeof() yields an unsigned size_t, so it is cast to int to keep the bound signed:
     * with an unsigned bound, a row shorter than two elements makes the subtraction
     * wrap around to a huge value and the loop would read out of bounds. */
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
    {
        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

        acc += b0 * (float4)a0.s0;
        acc += b1 * (float4)a0.s1;
    }

    /* Tail loop: remaining element of A, one row of B at a time */
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
    {
        float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

        acc += b0 * (float4)a0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
}
#endif /* (defined WIDTH_VECTOR_A) */
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl new file mode 100644 index 0000000000..5320a064ed --- /dev/null +++ b/src/core/CL/cl_kernels/harris_corners.cl @@ -0,0 +1,376 @@
/*
 * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Function running harris score on 3x3 block size + * + * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. + * e.g. -DDATA_TYPE=short. + * + * @param[in] src_gx_ptr Pointer to the first source image. 
Supported data types: S16, S32
 * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
 * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
 * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
 * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
 * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
 * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
 * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
 */
__kernel void harris_score_3x3(
    IMAGE_DECLARATION(src_gx),
    IMAGE_DECLARATION(src_gy),
    IMAGE_DECLARATION(vc),
    float sensitivity,
    float strength_thresh,
    float pow4_normalization_factor)
{
    Image grad_x = CONVERT_TO_IMAGE_STRUCT(src_gx);
    Image grad_y = CONVERT_TO_IMAGE_STRUCT(src_gy);
    Image score  = CONVERT_TO_IMAGE_STRUCT(vc);

    /* Windowed sums of Gx^2, Gy^2 and Gx*Gy, one lane per output pixel */
    float4 sum_gx2  = (float4)0.0f;
    float4 sum_gy2  = (float4)0.0f;
    float4 sum_gxgy = (float4)0.0f;

    /* Visit the three rows of the window: the one above, the current one and the one below */
    for(int dy = -1; dy <= 1; ++dy)
    {
        /* Gradients of this row, starting one pixel to the left of the current position */
        const VEC_DATA_TYPE(DATA_TYPE, 8) row_gx = vload8(0, (__global DATA_TYPE *)offset(&grad_x, -1, dy));
        const VEC_DATA_TYPE(DATA_TYPE, 8) row_gy = vload8(0, (__global DATA_TYPE *)offset(&grad_y, -1, dy));

        /* Left, middle and right column of the window for each of the four output pixels */
        const float4 left_gx  = convert_float4(row_gx.s0123);
        const float4 mid_gx   = convert_float4(row_gx.s1234);
        const float4 right_gx = convert_float4(row_gx.s2345);

        const float4 left_gy  = convert_float4(row_gy.s0123);
        const float4 mid_gy   = convert_float4(row_gy.s1234);
        const float4 right_gy = convert_float4(row_gy.s2345);

        sum_gx2 += (left_gx * left_gx) + (mid_gx * mid_gx) + (right_gx * right_gx);
        sum_gy2 += (left_gy * left_gy) + (mid_gy * mid_gy) + (right_gy * right_gy);
        sum_gxgy += (left_gx * left_gy) + (mid_gx * mid_gy) + (right_gx * right_gy);
    }

    /* Trace and determinant of the accumulated 2x2 structure matrix */
    const float4 trace = sum_gx2 + sum_gy2;
    const float4 det   = sum_gx2 * sum_gy2 - (sum_gxgy * sum_gxgy);

    /* Harris score, normalised */
    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;

    /* Scores that do not exceed the strength threshold are replaced by zero */
    mc = select(0.0f, mc, mc > (float4)strength_thresh);

    vstore4(mc, 0, (__global float *)score.ptr);
}

/** Function for calculating harris score 1x5.
 *
 * @param[in] src_gx Pointer to gx gradient image.
 * @param[in] src_gy Pointer to gy gradient image.
 * @param[in] row    Relative row.
+ */ +inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row) +{ + float4 gx2 = 0.0f; + float4 gy2 = 0.0f; + float4 gxgy = 0.0f; + + /* Row */ + VEC_DATA_TYPE(DATA_TYPE, 8) + temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row)); + VEC_DATA_TYPE(DATA_TYPE, 8) + temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row)); + + float4 gx = convert_float4(temp_gx.s0123); + float4 gy = convert_float4(temp_gy.s0123); + gx2 += (gx * gx); + gy2 += (gy * gy); + gxgy += (gx * gy); + + gx = convert_float4(temp_gx.s1234); + gy = convert_float4(temp_gy.s1234); + gx2 += (gx * gx); + gy2 += (gy * gy); + gxgy += (gx * gy); + + gx = convert_float4(temp_gx.s2345); + gy = convert_float4(temp_gy.s2345); + gx2 += (gx * gx); + gy2 += (gy * gy); + gxgy += (gx * gy); + + gx = convert_float4(temp_gx.s3456); + gy = convert_float4(temp_gy.s3456); + gx2 += (gx * gx); + gy2 += (gy * gy); + gxgy += (gx * gy); + + gx = convert_float4(temp_gx.s4567); + gy = convert_float4(temp_gy.s4567); + gx2 += (gx * gx); + gy2 += (gy * gy); + gxgy += (gx * gy); + + return (float16)(gx2, gy2, gxgy, (float4)0); +} + +/** Function running harris score on 5x5 block size + * + * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. + * e.g. -DDATA_TYPE=short. + * + * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32 + * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_gy_ptr Pointer to the second source image. 
Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
 * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
 * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
 * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
 * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
 * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
 * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
 */
__kernel void harris_score_5x5(
    IMAGE_DECLARATION(src_gx),
    IMAGE_DECLARATION(src_gy),
    IMAGE_DECLARATION(vc),
    float sensitivity,
    float strength_thresh,
    float pow4_normalization_factor)
{
    Image grad_x = CONVERT_TO_IMAGE_STRUCT(src_gx);
    Image grad_y = CONVERT_TO_IMAGE_STRUCT(src_gy);
    Image score  = CONVERT_TO_IMAGE_STRUCT(vc);

    /* Packed accumulator: Gx^2 in lanes 0-3, Gy^2 in lanes 4-7, Gx*Gy in lanes 8-11 */
    float16 sums = (float16)0.0f;

    /* Add the contribution of each of the five window rows */
    for(int dy = -2; dy <= 2; ++dy)
    {
        sums += harris_score_1x5(&grad_x, &grad_y, dy);
    }

    const float4 sum_gx2  = sums.s0123;
    const float4 sum_gy2  = sums.s4567;
    const float4 sum_gxgy = sums.s89AB;

    /* Trace and determinant of the accumulated 2x2 structure matrix */
    const float4 trace = sum_gx2 + sum_gy2;
    const float4 det   = sum_gx2 * sum_gy2 - (sum_gxgy * sum_gxgy);

    /* Harris score, normalised */
    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;

    /* Scores that do not exceed the strength threshold are replaced by zero */
    mc = select(0.0f, mc, mc > (float4)strength_thresh);

    vstore4(mc, 0, (__global float *)score.ptr);
}

/** Accumulates Gx^2, Gy^2 and Gx*Gy over one 7-pixel-wide row of the window, for four neighbouring output pixels.
 *
 * @param[in] src_gx Pointer to gx gradient image.
 * @param[in] src_gy Pointer to gy gradient image.
 * @param[in] row    Relative row.
 *
 * @return Packed sums: Gx^2 in lanes 0-3, Gy^2 in lanes 4-7, Gx*Gy in lanes 8-11, zeros in lanes 12-15.
 */
inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
{
    /* Ten consecutive gradients of the requested row, starting three pixels to the left:
     * the first eight come from one vload8, the last two from a vload2 */
    const VEC_DATA_TYPE(DATA_TYPE, 8) row_gx_lo = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
    const VEC_DATA_TYPE(DATA_TYPE, 8) row_gy_lo = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
    const VEC_DATA_TYPE(DATA_TYPE, 2) row_gx_hi = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
    const VEC_DATA_TYPE(DATA_TYPE, 2) row_gy_hi = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));

    /* One float4 per horizontal tap of the 7-wide window; the last two taps straddle both loads */
    const float4 taps_gx[7] =
    {
        convert_float4(row_gx_lo.s0123),
        convert_float4(row_gx_lo.s1234),
        convert_float4(row_gx_lo.s2345),
        convert_float4(row_gx_lo.s3456),
        convert_float4(row_gx_lo.s4567),
        convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(row_gx_lo.s567, row_gx_hi.s0)),
        convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(row_gx_lo.s67, row_gx_hi.s01))
    };
    const float4 taps_gy[7] =
    {
        convert_float4(row_gy_lo.s0123),
        convert_float4(row_gy_lo.s1234),
        convert_float4(row_gy_lo.s2345),
        convert_float4(row_gy_lo.s3456),
        convert_float4(row_gy_lo.s4567),
        convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(row_gy_lo.s567, row_gy_hi.s0)),
        convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(row_gy_lo.s67, row_gy_hi.s01))
    };

    float4 sum_gx2  = 0.0f;
    float4 sum_gy2  = 0.0f;
    float4 sum_gxgy = 0.0f;

    /* Accumulate the taps from left to right */
    for(int tap = 0; tap < 7; ++tap)
    {
        sum_gx2 += (taps_gx[tap] * taps_gx[tap]);
        sum_gy2 += (taps_gy[tap] * taps_gy[tap]);
        sum_gxgy += (taps_gx[tap] * taps_gy[tap]);
    }

    return (float16)(sum_gx2, sum_gy2, sum_gxgy, (float4)0);
}

/** Function running harris score on 7x7 block size
 *
 * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
 *                       e.g. -DDATA_TYPE=short.
 *
 * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
 * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[out] vc_ptr                               Pointer to the destination image.
Supported data types: F32 + * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation + * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores + * @param[in] pow4_normalization_factor Normalization factor to apply harris score + */ +__kernel void harris_score_7x7( + IMAGE_DECLARATION(src_gx), + IMAGE_DECLARATION(src_gy), + IMAGE_DECLARATION(vc), + float sensitivity, + float strength_thresh, + float pow4_normalization_factor) +{ + Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx); + Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy); + Image vc = CONVERT_TO_IMAGE_STRUCT(vc); + + /* Gx^2, Gy^2 and Gx*Gy */ + float16 res = (float16)0.0f; + + /* Compute row */ + for(int i = -3; i < 4; i++) + { + res += harris_score_1x7(&src_gx, &src_gy, i); + } + + float4 gx2 = res.s0123; + float4 gy2 = res.s4567; + float4 gxgy = res.s89AB; + + /* Compute trace and determinant */ + float4 trace = gx2 + gy2; + float4 det = gx2 * gy2 - (gxgy * gxgy); + + /* Compute harris score */ + float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor; + + mc = select(0.0f, mc, mc > (float4)strength_thresh); + + vstore4(mc, 0, (__global float *)vc.ptr); +} diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h new file mode 100644 index 0000000000..6db8ed567c --- /dev/null +++ b/src/core/CL/cl_kernels/helpers.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

/** Clamp x to the range [min_val, max_val] */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/** Build a vector type name by pasting type and size (e.g. short + 8 -> short8).
 *  The two-level definition lets macro arguments be expanded before they are pasted. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/** Expand to a call of convert_<type>(x) */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/** Expand to a call of convert_<type>_sat(x) */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/** Expand to a call of convert_<type>_sat_<round>(x) */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/** Kernel parameter list describing a 1D buffer called name */
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

/** Kernel parameter list describing a 2D buffer called name */
#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

/** Kernel parameter list describing a 3D buffer called name */
#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

/** Build a Vector from the parameters declared by VECTOR_DECLARATION(name) */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

/** Same as CONVERT_TO_VECTOR_STRUCT but with a step of 0, so the pointer is not advanced per work-item */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/** Build an Image from the parameters declared by IMAGE_DECLARATION(name) */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

/** Same as CONVERT_TO_IMAGE_STRUCT but with steps of 0, so the pointer is not advanced per work-item */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/** Build a Tensor3D from the parameters declared by TENSOR3D_DECLARATION(name) */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                  \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/** Same as CONVERT_TO_TENSOR3D_STRUCT but with steps of 0, so the pointer is not advanced per work-item */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
} Vector;

/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;

/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
} Tensor3D;

/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector + * @param[in] stride_x Stride of the vector in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * + * @return An image object + */ +Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) +{ + Vector vector = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + }; + vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; + return vector; +} + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * + * @return An image object + */ +Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 3D tensor object + */ +Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Tensor3D tensor = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z + }; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + return tensor; +} + +/** Get the pointer position of a Vector + * + * @param[in] vec Pointer to the starting position of the buffer + * @param[in] x Relative X position + */ +__global inline const uchar *vector_offset(const Vector *vec, int x) +{ + return vec->ptr + x * vec->stride_x; +} + +/** Get the pointer position of a Image + * + * @param[in] img Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + */ +__global inline uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Get the pointer position of a Tensor3D + * + * @param[in] tensor Pointer to the starting postion of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y 
position
 * @param[in] z      Relative Z position
 *
 * @return tensor->ptr advanced by x * stride_x + y * stride_y + z * stride_z bytes
 */
__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
new file mode 100644
index 0000000000..a652b28e6a
--- /dev/null
+++ b/src/core/CL/cl_kernels/histogram.cl
@@ -0,0 +1,243 @@
/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** Atomically increment histogram[win_pos.sN] for each of the 16 lanes of win_pos.
 *
 * One atomic_inc is issued per lane, from lane s0 to lane sf.
 * The arguments are pasted without extra parentheses, so pass plain identifiers.
 */
#define VATOMIC_INC16(histogram, win_pos)   \
    {                                       \
        atomic_inc(histogram + win_pos.s0); \
        atomic_inc(histogram + win_pos.s1); \
        atomic_inc(histogram + win_pos.s2); \
        atomic_inc(histogram + win_pos.s3); \
        atomic_inc(histogram + win_pos.s4); \
        atomic_inc(histogram + win_pos.s5); \
        atomic_inc(histogram + win_pos.s6); \
        atomic_inc(histogram + win_pos.s7); \
        atomic_inc(histogram + win_pos.s8); \
        atomic_inc(histogram + win_pos.s9); \
        atomic_inc(histogram + win_pos.sa); \
        atomic_inc(histogram + win_pos.sb); \
        atomic_inc(histogram + win_pos.sc); \
        atomic_inc(histogram + win_pos.sd); \
        atomic_inc(histogram + win_pos.se); \
        atomic_inc(histogram + win_pos.sf); \
    }

/** Calculate the histogram of an 8 bit grayscale image.
 *
 * Each thread will process 16 pixels and use one local atomic operation per pixel.
 * When all work items in a work group are done the resulting local histograms are
 * added to the global histogram using global atomics.
 *
 * @note The input image is represented as a two-dimensional array of type uchar.
 * The output is represented as a one-dimensional uint array of length of num_bins
 *
 * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
 * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
 * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
 * @param[out] histogram                           The output buffer to hold histogram final result.
Supported data types: U32 + * @param[out] num_bins The number of bins + * @param[out] offset The start of values to use (inclusive) + * @param[out] range The range of a bin + * @param[out] offrange The maximum value (exclusive) + */ +__kernel void hist_local_kernel(IMAGE_DECLARATION(input), + __local uint *histogram_local, + __global uint *restrict histogram, + uint num_bins, + uint offset, + uint range, + uint offrange) +{ + Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); + uint local_id_x = get_local_id(0); + + uint local_x_size = get_local_size(0); + + if(num_bins > local_x_size) + { + for(int i = local_id_x; i < num_bins; i += local_x_size) + { + histogram_local[i] = 0; + } + } + else + { + if(local_id_x <= num_bins) + { + histogram_local[local_id_x] = 0; + } + } + + uint16 vals = convert_uint16(vload16(0, input_buffer.ptr)); + + uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange)); + + barrier(CLK_LOCAL_MEM_FENCE); + VATOMIC_INC16(histogram_local, win_pos); + barrier(CLK_LOCAL_MEM_FENCE); + + if(num_bins > local_x_size) + { + for(int i = local_id_x; i < num_bins; i += local_x_size) + { + atomic_add(histogram + i, histogram_local[i]); + } + } + else + { + if(local_id_x <= num_bins) + { + atomic_add(histogram + local_id_x, histogram_local[local_id_x]); + } + } +} + +/** Calculate the histogram of an 8 bit grayscale image's border. + * + * Each thread will process one pixel using global atomic. + * When all work items in a work group are done the resulting local histograms are + * added to the global histogram using global atomics. + * + * @note The input image is represented as a two-dimensional array of type uchar. + * The output is represented as a one-dimensional uint array of length of num_bins + * + * @param[in] input_ptr Pointer to the first source image. 
Supported data types: U8 + * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 + * @param[out] num_bins The number of bins + * @param[out] offset The start of values to use (inclusive) + * @param[out] range The range of a bin + * @param[out] offrange The maximum value (exclusive) + */ +__kernel void hist_border_kernel(IMAGE_DECLARATION(input), + __global uint *restrict histogram, + uint num_bins, + uint offset, + uint range, + uint offrange) +{ + Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); + + uint val = (uint)(*input_buffer.ptr); + + uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0; + + if(val >= offset && (val < offrange)) + { + atomic_inc(histogram + win_pos); + } +} + +/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1. + * + * Each thread will process 16 pixels and use one local atomic operation per pixel. + * When all work items in a work group are done the resulting local histograms are + * added to the global histogram using global atomics. + * + * @note The input image is represented as a two-dimensional array of type uchar. + * The output is represented as a one-dimensional uint array of 256 elements + * + * @param[in] input_ptr Pointer to the first source image. 
Supported data types: U8 + * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32 + * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 + */ +__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input), + __local uint *histogram_local, + __global uint *restrict histogram) +{ + Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); + + uint local_index = get_local_id(0); + uint local_x_size = get_local_size(0); + + for(int i = local_index; i < 256; i += local_x_size) + { + histogram_local[i] = 0; + } + + uint16 vals = convert_uint16(vload16(0, input_buffer.ptr)); + + barrier(CLK_LOCAL_MEM_FENCE); + + atomic_inc(histogram_local + vals.s0); + atomic_inc(histogram_local + vals.s1); + atomic_inc(histogram_local + vals.s2); + atomic_inc(histogram_local + vals.s3); + atomic_inc(histogram_local + vals.s4); + atomic_inc(histogram_local + vals.s5); + atomic_inc(histogram_local + vals.s6); + atomic_inc(histogram_local + vals.s7); + atomic_inc(histogram_local + vals.s8); + atomic_inc(histogram_local + vals.s9); + atomic_inc(histogram_local + vals.sa); + atomic_inc(histogram_local + vals.sb); + atomic_inc(histogram_local + vals.sc); + atomic_inc(histogram_local + vals.sd); + atomic_inc(histogram_local + vals.se); + atomic_inc(histogram_local + vals.sf); + + barrier(CLK_LOCAL_MEM_FENCE); + + for(int i = local_index; i < 256; i += local_x_size) + { + atomic_add(histogram + i, 
histogram_local[i]); + } +} + +/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1. + * + * Each thread will process one pixel using global atomic. + * When all work items in a work group are done the resulting local histograms are + * added to the global histogram using global atomics. + * + * @note The input image is represented as a two-dimensional array of type uchar. + * The output is represented as a one-dimensional uint array of 256 + * + * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 + * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 + */ +__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input), + __global uint *restrict histogram) +{ + Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); + atomic_inc(histogram + *input_buffer.ptr); +} diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl new file mode 100644 index 0000000000..31dd57b767 --- /dev/null +++ b/src/core/CL/cl_kernels/hog.cl @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "types.h" + +#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE) + +/** This OpenCL kernel computes the HOG orientation binning + * + * @attention The following variables must be passed at compile time: + * + * -# -DCELL_WIDTH = Width of the cell + * -# -DCELL_HEIGHT = height of the cell + * -# -DNUM_BINS = Number of bins for each cell + * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG + * + * @note Each work-item computes a single cell + * + * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. 
Supported data types: S16 + * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes) + * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes) + * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image + * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8 + * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes) + * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] phase_stride_y Stride of the the phase image in Y dimension (in bytes) + * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the the phase image + * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell Supported data types: F32. 
Number of channels supported: equal to the number of histogram bins per cell + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag), + IMAGE_DECLARATION(phase), + IMAGE_DECLARATION(dst)) +{ + float bins[NUM_BINS] = { 0 }; + + // Compute address for the magnitude and phase images + Image mag = CONVERT_TO_IMAGE_STRUCT(mag); + Image phase = CONVERT_TO_IMAGE_STRUCT(phase); + + __global uchar *mag_row_ptr = mag.ptr; + __global uchar *phase_row_ptr = phase.ptr; + + for(int yc = 0; yc < CELL_HEIGHT; ++yc) + { + int xc = 0; + for(; xc <= (CELL_WIDTH - 4); xc += 4) + { + // Load magnitude and phase values + const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc)); + float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc)); + + // Scale phase: phase * scale + 0.5f + phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE; + + // Compute histogram index. + int4 hidx_s32 = convert_int4(phase_f32); + + // Compute magnitude weights (w0 and w1) + const float4 hidx_f32 = convert_float4(hidx_s32); + + // w1 = phase_f32 - hidx_s32 + const float4 w1_f32 = phase_f32 - hidx_f32; + + // w0 = 1.0 - w1 + const float4 w0_f32 = (float4)1.0f - w1_f32; + + // Calculate the weights for splitting vote + const float4 mag_w0_f32 = mag_f32 * w0_f32; + const float4 mag_w1_f32 = mag_f32 * w1_f32; + + // Weighted vote between 2 bins + + // Check if the histogram index is equal to NUM_BINS. 
If so, replace the index with 0 + hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS)); + + // Bin 0 + bins[hidx_s32.s0] += mag_w0_f32.s0; + bins[hidx_s32.s1] += mag_w0_f32.s1; + bins[hidx_s32.s2] += mag_w0_f32.s2; + bins[hidx_s32.s3] += mag_w0_f32.s3; + + hidx_s32 += (int4)1; + + // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0 + hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS)); + + // Bin1 + bins[hidx_s32.s0] += mag_w1_f32.s0; + bins[hidx_s32.s1] += mag_w1_f32.s1; + bins[hidx_s32.s2] += mag_w1_f32.s2; + bins[hidx_s32.s3] += mag_w1_f32.s3; + } + + // Left over computation + for(; xc < CELL_WIDTH; xc++) + { + const float mag_value = *((__global short *)mag_row_ptr + xc); + const float phase_value = *(mag_row_ptr + xc) * (float)PHASE_SCALE + 0.5f; + const float w1 = phase_value - floor(phase_value); + + // The quantised phase is the histogram index [0, NUM_BINS - 1] + // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0 + const uint hidx = (uint)(phase_value) % NUM_BINS; + + // Weighted vote between 2 bins + bins[hidx] += mag_value * (1.0f - w1); + bins[(hidx + 1) % NUM_BINS] += mag_value * w1; + } + + // Point to the next row of magnitude and phase images + mag_row_ptr += mag_stride_y; + phase_row_ptr += phase_stride_y; + } + + // Compute address for the destination image + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store the local HOG in the global memory + int xc = 0; + for(; xc <= (NUM_BINS - 4); xc += 4) + { + float4 values = vload4(0, bins + xc); + + vstore4(values, 0, ((__global float *)dst.ptr) + xc); + } + + // Left over stores + for(; xc < NUM_BINS; ++xc) + { + ((__global float *)dst.ptr)[xc] = bins[xc]; + } +} +#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE) + +#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD) + +#ifndef 
L2_NORM +#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel +#endif + +#ifndef L2HYS_NORM +#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel +#endif + +#ifndef L1_NORM +#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel +#endif + +/** This OpenCL kernel computes the HOG block normalization + * + * @attention The following variables must be passed at compile time: + * + * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block + * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction + * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block + * -# -DHOG_NORM_TYPE = Normalization type + * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method + * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM + * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM + * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM + * + * @note Each work-item computes a single block + * + * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image which stores the normlized HOG Supported data types: F32. 
Number of channels supported: equal to the number of histogram bins per block + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void hog_block_normalization(IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + float sum = 0.0f; + float4 sum_f32 = (float4)(0.0f); + + // Compute address for the source and destination tensor + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc) + { + const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y); + + int xc = 0; + for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16) + { + const float4 val0 = vload4(0, hist_ptr + xc + 0); + const float4 val1 = vload4(0, hist_ptr + xc + 4); + const float4 val2 = vload4(0, hist_ptr + xc + 8); + const float4 val3 = vload4(0, hist_ptr + xc + 12); + +#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) + // Compute val^2 for L2_NORM or L2HYS_NORM + sum_f32 += val0 * val0; + sum_f32 += val1 * val1; + sum_f32 += val2 * val2; + sum_f32 += val3 * val3; +#else + // Compute |val| for L1_NORM + sum_f32 += fabs(val0); + sum_f32 += fabs(val1); + sum_f32 += fabs(val2); + sum_f32 += fabs(val3); +#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) + + // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization. 
+ // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values + // will be accessed consecutively + vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X); + vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X); + vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X); + vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X); + } + + // Compute left over + for(; xc < NUM_BINS_PER_BLOCK_X; ++xc) + { + const float val = hist_ptr[xc]; + +#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) + sum += val * val; +#else + sum += fabs(val); +#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) + + ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val; + } + } + + sum += dot(sum_f32, (float4)1.0f); + + float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f); + +#if(HOG_NORM_TYPE == L2HYS_NORM) + // Reset sum + sum_f32 = (float4)0.0f; + sum = 0.0f; + + int k = 0; + for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16) + { + float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0); + float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4); + float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8); + float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12); + + // Scale val + val0 = val0 * (float4)scale; + val1 = val1 * (float4)scale; + val2 = val2 * (float4)scale; + val3 = val3 * (float4)scale; + + // Clip val if over _threshold_l2hys + val0 = fmin(val0, (float4)L2_HYST_THRESHOLD); + val1 = fmin(val1, (float4)L2_HYST_THRESHOLD); + val2 = fmin(val2, (float4)L2_HYST_THRESHOLD); + val3 = fmin(val3, (float4)L2_HYST_THRESHOLD); + + // Compute val^2 + sum_f32 += val0 * val0; + sum_f32 += val1 * val1; + sum_f32 += val2 * val2; + sum_f32 += val3 * val3; + + vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0); + vstore4(val1, 0, ((__global 
float *)dst.ptr) + k + 4); + vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8); + vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12); + } + + // Compute left over + for(; k < NUM_BINS_PER_BLOCK; ++k) + { + float val = ((__global float *)dst.ptr)[k] * scale; + + // Clip scaled input_value if over L2_HYST_THRESHOLD + val = fmin(val, (float)L2_HYST_THRESHOLD); + + sum += val * val; + + ((__global float *)dst.ptr)[k] = val; + } + + sum += dot(sum_f32, (float4)1.0f); + + // We use the same constants of OpenCV + scale = 1.0f / (sqrt(sum) + 1e-3f); + +#endif // (HOG_NORM_TYPE == L2HYS_NORM) + + int i = 0; + for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16) + { + float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0); + float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4); + float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8); + float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12); + + // Multiply val by the normalization scale factor + val0 = val0 * (float4)scale; + val1 = val1 * (float4)scale; + val2 = val2 * (float4)scale; + val3 = val3 * (float4)scale; + + vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0); + vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4); + vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8); + vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12); + } + + for(; i < NUM_BINS_PER_BLOCK; ++i) + { + ((__global float *)dst.ptr)[i] *= scale; + } +} +#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD) + +#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT) + +/** This OpenCL kernel computes the HOG detector using linear SVM + * + * @attention The following variables must be 
passed at compile time: + * + * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction + * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction + * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane + * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array + * -# -DIDX_CLASS = Index of the class to detect + * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction + * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction + * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window + * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window + * + * @note Each work-item computes a single detection window + * + * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] hog_descriptor Pointer to HOG descriptor. 
Supported data types: F32 + * @param[out] dst Pointer to DetectionWindow array + * @param[out] num_detection_windows Number of objects detected + */
__kernel void hog_detector(IMAGE_DECLARATION(src),
                           __global float *hog_descriptor,
                           __global DetectionWindow *dst,
                           __global uint *num_detection_windows)
{
    // Bail out early once the DetectionWindow array has no free slot left
    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
    {
        return;
    }

    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Row pitch of the HOG image expressed in float elements rather than bytes
    const int src_step_y_f32 = src_stride_y / sizeof(float);

    // The unrolled part accumulates in four lanes, the tail in a scalar;
    // both are merged after the loops
    float4 score_f32 = (float4)0.0f;
    float  score     = 0.0f;

    __global float *src_row_ptr = (__global float *)src.ptr;

    // Linear SVM: dot product between the window descriptor and the SVM weights
    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
    {
        // Weights that belong to the current descriptor row
        __global float *weights_row = hog_descriptor + yb * NUM_BINS_PER_DESCRIPTOR_X;

        int xb = 0;

        // Eight bins per iteration, as two float4 multiply-accumulates
        while(xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8)
        {
            const float4 hog_lo = vload4(0, src_row_ptr + xb);
            const float4 hog_hi = vload4(1, src_row_ptr + xb);

            const float4 svm_lo = vload4(0, weights_row + xb);
            const float4 svm_hi = vload4(1, weights_row + xb);

            score_f32 += hog_lo * svm_lo;
            score_f32 += hog_hi * svm_hi;

            xb += 8;
        }

        // Remaining bins of the row, one at a time
        while(xb < NUM_BINS_PER_DESCRIPTOR_X)
        {
            const float a = src_row_ptr[xb];
            const float b = weights_row[xb];

            score += a * b;

            ++xb;
        }
    }

    // Fold the four vector lanes into the scalar score
    score += dot(score_f32, (float4)1.0f);

    // The bias is stored right after the weights, i.e. at index
    // NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y == (descriptor_size - 1)
    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];

    if(score > (float)THRESHOLD)
    {
        // Reserve an output slot; the counter keeps growing even when the array is
        // already full, so the slot index has to be range-checked before writing
        int id = atomic_inc(num_detection_windows);
        if(id < MAX_NUM_DETECTION_WINDOWS)
        {
            __global DetectionWindow *win = dst + id;

            win->x         = get_global_id(0) * BLOCK_STRIDE_WIDTH;
            win->y         = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
            win->width     = DETECTION_WINDOW_WIDTH;
            win->height    = DETECTION_WINDOW_HEIGHT;
            win->idx_class = IDX_CLASS;
            win->score     = score;
        }
    }
}
#endif // defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && ...
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl new file mode 100644 index 0000000000..970e04e150 --- /dev/null +++ b/src/core/CL/cl_kernels/integral_image.cl @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */
#include "helpers.h"

/** Horizontal pass of the integral image: running sum along X.
 *
 * The work-item walks src_step_x source elements in chunks of 16, widens each chunk to
 * 32 bit, turns it into an inclusive prefix sum and carries the last value over to the
 * next chunk.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U32
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void integral_horizontal(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Running total handed from one 16-element chunk to the next
    uint carry = 0;

    for(uint x = 0; x < src_step_x; x += 16)
    {
        barrier(CLK_GLOBAL_MEM_FENCE);

        uint16 acc = convert_uint16(vload16(0, offset(&src, x, 0)));

        // Inclusive prefix sum: every lane adds the (already summed) lane on its left
        acc.s0 += carry;
        acc.s1 += acc.s0;
        acc.s2 += acc.s1;
        acc.s3 += acc.s2;
        acc.s4 += acc.s3;
        acc.s5 += acc.s4;
        acc.s6 += acc.s5;
        acc.s7 += acc.s6;
        acc.s8 += acc.s7;
        acc.s9 += acc.s8;
        acc.sA += acc.s9;
        acc.sB += acc.sA;
        acc.sC += acc.sB;
        acc.sD += acc.sC;
        acc.sE += acc.sD;
        acc.sF += acc.sE;

        carry = acc.sF;

        vstore16(acc, 0, (__global uint *)offset(&dst, x, 0));
    }
}

/** Vertical pass of the integral image: running sum along Y, done in place.
 *
 * The work-item owns a strip of 8 columns and adds every row to the accumulated
 * row above it, for rows 1 .. height - 1.
 *
 * @param[in,out] src_ptr                           Pointer to the source image. Supported data types: U32
 * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]     height                            Image height.
 */
__kernel void integral_vertical(
    IMAGE_DECLARATION(src),
    uint height)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Row 0 is already final; it seeds the accumulation
    uint8 above = vload8(0, (__global uint *)offset(&src, 0, 0));

    for(uint row = 1; row < height; ++row)
    {
        barrier(CLK_GLOBAL_MEM_FENCE);

        __global uint *row_ptr = (__global uint *)offset(&src, 0, row);

        uint8 current = vload8(0, row_ptr);
        current += above;
        vstore8(current, 0, row_ptr);

        above = current;
    }
}
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl new file mode 100644 index 0000000000..c4b0df8de9 --- /dev/null +++ b/src/core/CL/cl_kernels/magnitude_phase.cl @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Calculates the L1 magnitude |a| + |b| of two inputs, lane by lane. + * + * @note The two absolute values are added with add_sat and the sum is converted back to DATA_TYPE with CONVERT_SAT, so large inputs clamp instead of wrapping around. + * + * @param[in] a First input. Supported data types: S16, S32 + * @param[in] b Second input. Supported data types: S16, S32 + * + * @return L1 magnitude of the two inputs (16 lanes). Supported data types: S16, S32 + */ +inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) +{ + return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16)); +} + +/** Calculates L2 normalization between two inputs. + * + * @param[in] a First input. Supported data types: S16, S32 + * @param[in] b Second input. Supported data types: S16, S32 + * + * @return L2 normalization magnitude result. 
Supported data types: S16, S32 + */
inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
{
    // a^2 + b^2 is formed in integer arithmetic, then the root is taken in float
    const uint16  squared_sum = convert_uint16(a * a) + convert_uint16(b * b);
    const float16 root        = sqrt(convert_float16(squared_sum));

    // + 0.5f rounds to nearest before the saturating conversion
    return CONVERT_SAT((root + 0.5f),
                       VEC_DATA_TYPE(DATA_TYPE, 16));
}

/** Calculates unsigned phase between two inputs.
 *
 * The angle atan2(b, a) is expressed in degrees; negative angles are shifted up by
 * 180 degrees so that opposite directions map to the same value.
 *
 * @param[in] a First input. Supported data types: S16, S32
 * @param[in] b Second input. Supported data types: S16, S32
 *
 * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
 */
inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
{
    // atan2pi returns the angle divided by pi, hence the factor 180 to get degrees
    const float16 degrees     = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
    const int16   is_negative = degrees < (float16)0.0f;
    const float16 folded      = select(degrees, (float16)180.0f + degrees, is_negative);

    return convert_uchar16(folded);
}

/** Calculates signed phase between two inputs.
 *
 * The full circle is quantised to 256 steps: atan2pi gives the angle in units of pi,
 * negative values are wrapped by adding 2 and the result is scaled by 128.
 *
 * @param[in] a First input. Supported data types: S16, S32
 * @param[in] b Second input. Supported data types: S16, S32
 *
 * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
 */
inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
{
    const float16 half_turns = atan2pi(convert_float16(b), convert_float16(a));
    const float16 wrapped    = select(half_turns, half_turns + 2, half_turns < 0.0f);

    // Round to nearest and keep the low 8 bits so that a full turn wraps back to 0
    return convert_uchar16(convert_int16(mad(wrapped, 128, 0.5f)) & 0xFFu);
}

// Select the magnitude implementation from -DMAGNITUDE (1 = L1, 2 = L2)
#if(1 == MAGNITUDE)
#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
#elif(2 == MAGNITUDE)
#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
#else
#define MAGNITUDE_OP(x, y)
#endif

// Select the phase implementation from -DPHASE (1 = unsigned, 2 = signed)
#if(1 == PHASE)
#define PHASE_OP(x, y) phase_unsigned((x), (y))
#elif(2 == PHASE)
#define PHASE_OP(x, y) phase_signed((x), (y))
#else
#define PHASE_OP(x, y)
#endif

/** Calculate the magnitude and phase of given the gradients of an image. + * + * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2). 
+ * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256). + * + * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1 + * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1 + * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int + * + * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32 + * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] gy_ptr Pointer to the second source image (gradient Y). Supported data types: S16, S32 + * @param[in] gy_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gy_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] magnitude_ptr Pointer to the magnitude destination image. 
Supported data types: S16, S32 + * @param[in] magnitude_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] magnitude_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8 + * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image + * */
__kernel void magnitude_phase(
    IMAGE_DECLARATION(gx),
    IMAGE_DECLARATION(gy)
#ifdef MAGNITUDE
    ,
    IMAGE_DECLARATION(magnitude)
#endif
#ifdef PHASE
    ,
    IMAGE_DECLARATION(phase)
#endif
)
{
    // Resolve the per-work-item pointers of both gradient images
    Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
    Image gy = CONVERT_TO_IMAGE_STRUCT(gy);

    // 16 gradient samples from each input
    VEC_DATA_TYPE(DATA_TYPE, 16)
    grad_x = vload16(0, (__global DATA_TYPE *)gx.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 16)
    grad_y = vload16(0, (__global DATA_TYPE *)gy.ptr);

    // Each output only exists when its macro was passed at build time
#ifdef MAGNITUDE
    Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
    vstore16(MAGNITUDE_OP(grad_x, grad_y), 0, (__global DATA_TYPE *)magnitude.ptr);
#endif
#ifdef PHASE
    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
    vstore16(PHASE_OP(grad_x, grad_y), 0, phase.ptr);
#endif
}
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl 
b/src/core/CL/cl_kernels/mean_stddev.cl new file mode 100644 index 0000000000..50b8312548 --- /dev/null +++ b/src/core/CL/cl_kernels/mean_stddev.cl @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + +/** This function calculates the sum and sum of squares of a given input image. + * + * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] height Height of the input image + * @param[out] global_sum Global sum of all elements + * @param[out] global_sum_sq Global sum of squares of all elements + */
__kernel void mean_stddev_accumulate(
    IMAGE_DECLARATION(src),
    uint height,
    __global ulong *global_sum
#if defined STDDEV
    ,
    __global ulong *global_sum_sq
#endif
)
{
    // Get pixels pointer
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // One 32-bit accumulator per column of the 8-pixel wide strip owned by this work-item.
    // For U8 input a lane of squares stays below 2^32 as long as 255 * 255 * height does,
    // i.e. for height <= 66051.
    uint8 tmp_sum = 0;
#if defined STDDEV
    uint8 tmp_sum_sq = 0;
#endif

    // Calculate partial sums over all rows; the counter is unsigned like height
    for(uint i = 0; i < height; i++)
    {
        // Load 8 pixels of row i and widen them to 32 bit
        uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));

        tmp_sum += data;
#if defined STDDEV
        tmp_sum_sq += data * data;
#endif
    }

    // Perform the horizontal reduction in 64 bit. Adding the eight lanes while they are
    // still 32 bit wide can wrap around (for squares: as soon as height exceeds 8256),
    // and the wrapped value would then be added to the 64-bit global counter.
    const ulong4 sum4 = convert_ulong4(tmp_sum.s0123) + convert_ulong4(tmp_sum.s4567);
    const ulong2 sum2 = sum4.s01 + sum4.s23;
    atom_add(global_sum, sum2.s0 + sum2.s1);

#if defined STDDEV
    const ulong4 sum_sq4 = convert_ulong4(tmp_sum_sq.s0123) + convert_ulong4(tmp_sum_sq.s4567);
    const ulong2 sum_sq2 = sum_sq4.s01 + sum_sq4.s23;
    atom_add(global_sum_sq, sum_sq2.s0 + sum_sq2.s1);
#endif
}

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl new file mode 100644 index 0000000000..799b1e8c3b --- /dev/null +++ b/src/core/CL/cl_kernels/minmaxloc.cl @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */
#include "helpers.h"
#include "types.h"

// Fallback value range (that of an 8-bit unsigned type) when the build does not pass one
#ifndef DATA_TYPE_MIN
#define DATA_TYPE_MIN 0x0
#endif

#ifndef DATA_TYPE_MAX
#define DATA_TYPE_MAX 0xFF
#endif

// Neutral elements of max() / min() and the lane indices 0..15 used to mask the row tail
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
__constant uint16 idx16                          = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

/** This function identifies the min and maximum value of an input image.
 *
 * Each work-item scans one row 16 elements at a time, reduces the 16 lanes to a single
 * minimum and maximum and merges them atomically into @p min_max.
 *
 * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
 * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
 * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
 * @param[in]  width                             Input image width
 */
__kernel void minmax(
    IMAGE_DECLARATION(src),
    __global int *min_max,
    uint          width)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Start from the neutral elements so that the first comparison always wins
    VEC_DATA_TYPE(DATA_TYPE, 16)
    row_min = type_max;
    VEC_DATA_TYPE(DATA_TYPE, 16)
    row_max = type_min;

    // Part of the row that is covered by complete 16-element vectors
    const uint aligned_width = (width >> 4) << 4;
    for(uint x = 0; x < aligned_width; x += 16)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        chunk   = vload16(0, (__global DATA_TYPE *)offset(&src, x, 0));
        row_min = min(chunk, row_min);
        row_max = max(chunk, row_max);
    }

#ifdef NON_MULTIPLE_OF_16
    // Tail: load a whole vector and neutralise the lanes whose index is >= width
    VEC_DATA_TYPE(DATA_TYPE, 16)
    tail = vload16(0, (__global DATA_TYPE *)offset(&src, aligned_width, 0));
    VEC_DATA_TYPE(DATA_TYPE, 16)
    in_range = CONVERT(((uint16)(aligned_width) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
    row_max  = max(row_max, select(type_min, tail, in_range));
    row_min  = min(row_min, select(type_max, tail, in_range));
#endif

    // Halve the number of candidate lanes until lane 0 holds the result: 16 -> 8 -> 4 -> 2 -> 1
    row_min.s01234567 = min(row_min.s01234567, row_min.s89ABCDEF);
    row_max.s01234567 = max(row_max.s01234567, row_max.s89ABCDEF);

    row_min.s0123 = min(row_min.s0123, row_min.s4567);
    row_max.s0123 = max(row_max.s0123, row_max.s4567);

    row_min.s01 = min(row_min.s01, row_min.s23);
    row_max.s01 = max(row_max.s01, row_max.s23);

    row_min.s0 = min(row_min.s0, row_min.s1);
    row_max.s0 = max(row_max.s0, row_max.s1);

    // Merge the row result into the global min/max
    atomic_min(&min_max[0], row_min.s0);
    atomic_max(&min_max[1], row_max.s0);
}

/** This function counts the min and max occurrences in an image and tags their position.
 *
 * Each work-item inspects a single element and compares it with the values in @p min_max.
 *
 * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
 * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
 *       The positions are recorded from inside the counting code, so they are only written when -DCOUNT_MIN_MAX is passed as well.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
 * @param[out] min_max_count                     Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
 * @param[out] min_loc                           Array that holds the location of the minimum value occurrences
 * @param[in]  max_min_loc_count                 The maximum number of min value occurrences coordinates the array can hold
 * @param[out] max_loc                           Array that holds the location of the maximum value occurrences
 * @param[in]  max_max_loc_count                 The maximum number of max value occurrences coordinates the array can hold
 */
__kernel void minmaxloc(
    IMAGE_DECLARATION(src),
    __global int *min_max,
    __global uint *min_max_count
#if defined LOCATE_MIN
    ,
    __global Coordinates2D *min_loc, uint max_min_loc_count
#endif
#if defined LOCATE_MAX
    ,
    __global Coordinates2D *max_loc, uint max_max_loc_count
#endif
)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    DATA_TYPE pixel = *((__global DATA_TYPE *)src.ptr);
#if defined COUNT_MIN_MAX
    if(pixel == min_max[0])
    {
        // The counter keeps counting every occurrence; only the first
        // max_min_loc_count positions fit into the array
        uint slot = atomic_inc(&min_max_count[0]);
#if defined LOCATE_MIN
        if(slot < max_min_loc_count)
        {
            min_loc[slot].x = get_global_id(0);
            min_loc[slot].y = get_global_id(1);
        }
#endif
    }
    if(pixel == min_max[1])
    {
        uint slot = atomic_inc(&min_max_count[1]);
#if defined LOCATE_MAX
        if(slot < max_max_loc_count)
        {
            max_loc[slot].x = get_global_id(0);
            max_loc[slot].y = get_global_id(1);
        }
#endif
    }
#endif
}
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl new file mode 100644 index 0000000000..f860c96bb8 --- /dev/null +++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "non_linear_filter_helpers.h" + +/** This function applies a non linear filter on a 3x3 box basis on an input image. + * + * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */
__kernel void non_linear_filter_box3x3(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Fetch the three rows of the neighbourhood, each starting one pixel to the left
    uchar16 row_up   = vload16(0, offset(&src, -1, -1));
    uchar16 row_mid  = vload16(0, offset(&src, -1, 0));
    uchar16 row_down = vload16(0, offset(&src, -1, 1));

    // Exactly one of MIN / MAX / MEDIAN selects the filter at build time
#if defined MIN
    // Reduce the three rows first, then every horizontal triple
    uchar16 column_min = min(row_up, min(row_mid, row_down));
    uchar8  out        = row_reduce_min_3(column_min);
#elif defined MAX
    uchar16 column_max = max(row_up, max(row_mid, row_down));
    uchar8  out        = row_reduce_max_3(column_max);
#elif defined MEDIAN
    // Nine shifted views, one per position of the 3x3 window, for 8 output pixels
    uchar8 tl  = row_up.s01234567;
    uchar8 tc  = row_up.s12345678;
    uchar8 tr  = row_up.s23456789;
    uchar8 ml  = row_mid.s01234567;
    uchar8 mc  = row_mid.s12345678;
    uchar8 mr  = row_mid.s23456789;
    uchar8 bl  = row_down.s01234567;
    uchar8 bc  = row_down.s12345678;
    uchar8 br  = row_down.s23456789;
    uchar8 out = sort9(tl, tc, tr, ml, mc, mr, bl, bc, br);
#else
#error "Unsupported filter function"
#endif

    // Store result
    vstore8(out, 0, dst.ptr);
}

/** This function applies a non linear filter on a 3x3 cross basis on an input image. + * + * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_linear_filter_cross3x3(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // The vertical arm only needs the centre column (8 elements per row); the
    // horizontal arm needs the centre row starting one column to the left.
    const uchar8  above  = vload8(0, offset(&src_img, 0, -1));
    const uchar16 centre = vload16(0, offset(&src_img, -1, 0));
    const uchar8  below  = vload8(0, offset(&src_img, 0, 1));

    // Reduce the cross-shaped neighbourhood with the operation selected at build time
#if defined(MIN)
    const uchar8 centre_min = row_reduce_min_3(centre);
    const uchar8 result     = min(centre_min, min(above, below));
#elif defined(MAX)
    const uchar8 centre_max = row_reduce_max_3(centre);
    const uchar8 result     = max(centre_max, max(above, below));
#elif defined(MEDIAN)
    // Five candidates per output: above, left, centre, right, below
    const uchar8 result = sort5(above, centre.s01234567, centre.s12345678, centre.s23456789, below);
#else
#error "Unsupported filter function"
#endif

    // Write the 8 filtered elements
    vstore8(result, 0, dst_img.ptr);
}

/** Applies a non linear filter with a 3x3 disk pattern to an input image.
 *
 * Every work-item produces 8 horizontally adjacent output elements. This implementation
 * reads the full 3x3 neighbourhood of each output element, exactly as non_linear_filter_box3x3 does.
 *
 * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_linear_filter_disk3x3(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // Fetch 16 elements from each of the three rows, starting one column to the left
    const uchar16 row_above = vload16(0, offset(&src_img, -1, -1));
    const uchar16 row_mid   = vload16(0, offset(&src_img, -1, 0));
    const uchar16 row_below = vload16(0, offset(&src_img, -1, 1));

    // Reduce the neighbourhood with the operation selected at build time
#if defined(MIN)
    const uchar8 result = row_reduce_min_3(min(row_above, min(row_mid, row_below)));
#elif defined(MAX)
    const uchar8 result = row_reduce_max_3(max(row_above, max(row_mid, row_below)));
#elif defined(MEDIAN)
    // Nine shifted views of the rows: one vector per position of the 3x3 window
    const uchar8 result = sort9(row_above.s01234567, row_above.s12345678, row_above.s23456789,
                                row_mid.s01234567, row_mid.s12345678, row_mid.s23456789,
                                row_below.s01234567, row_below.s12345678, row_below.s23456789);
#else
#error "Unsupported filter function"
#endif

    // Write the 8 filtered elements
    vstore8(result, 0, dst_img.ptr);
}
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
new file mode 100644
index 0000000000..d9ae95fd2d
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -0,0 +1,479 @@
/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "non_linear_filter_helpers.h"

// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html

/** Sorting network to sort 8 disks of diameter 5 and return their median.
 *
 * For each of the 8 output lanes, 21 candidate values are gathered: three horizontally
 * consecutive values from each of @p top2 and @p bottom2, and five from each of @p top,
 * @p middle and @p bottom. The compare-exchange sequence then leaves the median in p10.
 *
 * @note The swizzles below start at element 0 of every row. In non_linear_filter_disk5x5
 *       the rows @p top2 and @p bottom2 are loaded one column to the right of the other rows,
 *       which is what centres the three-element rows under the five-element ones.
 *
 * @param[in] top2    Values of elements two rows above.
 * @param[in] top     Values of elements one row above.
 * @param[in] middle  Values of middle elements.
 * @param[in] bottom  Values of elements one row below.
 * @param[in] bottom2 Values of elements two rows below.
 *
 * @return Median values for 8 elements.
 */
inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
{
    // Three candidates from the row two above
    uchar8 p0  = top2.s01234567;
    uchar8 p1  = top2.s12345678;
    uchar8 p2  = top2.s23456789;
    // Five candidates from the row above
    uchar8 p3  = top.s01234567;
    uchar8 p4  = top.s12345678;
    uchar8 p5  = top.s23456789;
    uchar8 p6  = top.s3456789A;
    uchar8 p7  = top.s456789AB;
    // Five candidates from the centre row
    uchar8 p8  = middle.s01234567;
    uchar8 p9  = middle.s12345678;
    uchar8 p10 = middle.s23456789;
    uchar8 p11 = middle.s3456789A;
    uchar8 p12 = middle.s456789AB;
    // Five candidates from the row below
    uchar8 p13 = bottom.s01234567;
    uchar8 p14 = bottom.s12345678;
    uchar8 p15 = bottom.s23456789;
    uchar8 p16 = bottom.s3456789A;
    uchar8 p17 = bottom.s456789AB;
    // Three candidates from the row two below
    uchar8 p18 = bottom2.s01234567;
    uchar8 p19 = bottom2.s12345678;
    uchar8 p20 = bottom2.s23456789;

    // Generated compare-exchange network: the order of the SORT calls is significant
    // and must not be changed. Each SORT leaves the element-wise minimum in its first
    // argument and the maximum in its second.
    SORT(p0, p1);
    SORT(p2, p3);
    SORT(p4, p5);
    SORT(p6, p7);
    SORT(p8, p9);
    SORT(p10, p11);
    SORT(p12, p13);
    SORT(p14, p15);
    SORT(p16, p17);
    SORT(p18, p19);
    SORT(p0, p2);
    SORT(p1, p3);
    SORT(p4, p6);
    SORT(p5, p7);
    SORT(p8, p10);
    SORT(p9, p11);
    SORT(p12, p14);
    SORT(p13, p15);
    SORT(p16, p18);
    SORT(p17, p19);
    SORT(p1, p2);
    SORT(p5, p6);
    SORT(p0, p4);
    SORT(p3, p7);
    SORT(p9, p10);
    SORT(p13, p14);
    SORT(p8, p12);
    SORT(p11, p15);
    SORT(p17, p18);
    SORT(p16, p20);
    SORT(p1, p5);
    SORT(p2, p6);
    SORT(p9, p13);
    SORT(p10, p14);
    SORT(p0, p8);
    SORT(p7, p15);
    SORT(p17, p20);
    SORT(p1, p4);
    SORT(p3, p6);
    SORT(p9, p12);
    SORT(p11, p14);
    SORT(p18, p20);
    SORT(p0, p16);
    SORT(p2, p4);
    SORT(p3, p5);
    SORT(p10, p12);
    SORT(p11, p13);
    SORT(p1, p9);
    SORT(p6, p14);
    SORT(p19, p20);
    SORT(p3, p4);
    SORT(p11, p12);
    SORT(p1, p8);
    SORT(p2, p10);
    SORT(p5, p13);
    SORT(p7, p14);
    SORT(p3, p11);
    SORT(p2, p8);
    SORT(p4, p12);
    SORT(p7, p13);
    SORT(p1, p17);
    SORT(p3, p10);
    SORT(p5, p12);
    SORT(p1, p16);
    SORT(p2, p18);
    SORT(p3, p9);
    SORT(p6, p12);
    SORT(p2, p16);
    SORT(p3, p8);
    SORT(p7, p12);
    SORT(p5, p9);
    SORT(p6, p10);
    SORT(p4, p8);
    SORT(p7, p11);
    SORT(p3, p19);
    SORT(p5, p8);
    SORT(p7, p10);
    SORT(p3, p18);
    SORT(p4, p20);
    SORT(p6, p8);
    SORT(p7, p9);
    SORT(p3, p17);
    SORT(p5, p20);
    SORT(p7, p8);
    SORT(p3, p16);
    SORT(p6, p20);
    SORT(p5, p17);
    SORT(p7, p20);
    SORT(p4, p16);
    SORT(p6, p18);
    SORT(p5, p16);
    SORT(p7, p19);
    SORT(p7, p18);
    SORT(p6, p16);
    SORT(p7, p17);
    SORT(p10, p18);
    SORT(p7, p16);
    SORT(p9, p17);
    SORT(p8, p16);
    SORT(p9, p16);
    SORT(p10, p16);

    // p10 is the middle position (index 10 of 0..20) of the 21 candidates
    return p10;
}

/** Sorting network to sort 8 boxes of size 5 and return their median.
 *
 * @param[in] top2    Values of elements two rows above.
 * @param[in] top     Values of elements one row above.
 * @param[in] middle  Values of middle elements.
 * @param[in] bottom  Values of elements one row below.
 * @param[in] bottom2 Values of elements two rows below.
 *
 * @return Median values for 8 elements.
 */
inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
{
    // For each of the 8 output lanes gather 25 candidates: five horizontally
    // consecutive values (shifted views s0.., s1.., s2.., s3.., s4..) from each of the five rows.
    uchar8 p0  = top2.s01234567;
    uchar8 p1  = top2.s12345678;
    uchar8 p2  = top2.s23456789;
    uchar8 p3  = top2.s3456789A;
    uchar8 p4  = top2.s456789AB;
    uchar8 p5  = top.s01234567;
    uchar8 p6  = top.s12345678;
    uchar8 p7  = top.s23456789;
    uchar8 p8  = top.s3456789A;
    uchar8 p9  = top.s456789AB;
    uchar8 p10 = middle.s01234567;
    uchar8 p11 = middle.s12345678;
    uchar8 p12 = middle.s23456789;
    uchar8 p13 = middle.s3456789A;
    uchar8 p14 = middle.s456789AB;
    uchar8 p15 = bottom.s01234567;
    uchar8 p16 = bottom.s12345678;
    uchar8 p17 = bottom.s23456789;
    uchar8 p18 = bottom.s3456789A;
    uchar8 p19 = bottom.s456789AB;
    uchar8 p20 = bottom2.s01234567;
    uchar8 p21 = bottom2.s12345678;
    uchar8 p22 = bottom2.s23456789;
    uchar8 p23 = bottom2.s3456789A;
    uchar8 p24 = bottom2.s456789AB;

    // Generated compare-exchange network: the order of the SORT calls is significant
    // and must not be changed. Each SORT leaves the element-wise minimum in its first
    // argument and the maximum in its second.
    SORT(p1, p2);
    SORT(p0, p1);
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p3, p4);
    SORT(p4, p5);
    SORT(p0, p3);
    SORT(p2, p5);
    SORT(p2, p3);
    SORT(p1, p4);
    SORT(p1, p2);
    SORT(p3, p4);
    SORT(p7, p8);
    SORT(p6, p7);
    SORT(p7, p8);
    SORT(p10, p11);
    SORT(p9, p10);
    SORT(p10, p11);
    SORT(p6, p9);
    SORT(p8, p11);
    SORT(p8, p9);
    SORT(p7, p10);
    SORT(p7, p8);
    SORT(p9, p10);
    SORT(p0, p6);
    SORT(p4, p10);
    SORT(p4, p6);
    SORT(p2, p8);
    SORT(p2, p4);
    SORT(p6, p8);
    SORT(p1, p7);
    SORT(p5, p11);
    SORT(p5, p7);
    SORT(p3, p9);
    SORT(p3, p5);
    SORT(p7, p9);
    SORT(p1, p2);
    SORT(p3, p4);
    SORT(p5, p6);
    SORT(p7, p8);
    SORT(p9, p10);
    SORT(p13, p14);
    SORT(p12, p13);
    SORT(p13, p14);
    SORT(p16, p17);
    SORT(p15, p16);
    SORT(p16, p17);
    SORT(p12, p15);
    SORT(p14, p17);
    SORT(p14, p15);
    SORT(p13, p16);
    SORT(p13, p14);
    SORT(p15, p16);
    SORT(p19, p20);
    SORT(p18, p19);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p21, p23);
    SORT(p22, p24);
    SORT(p22, p23);
    SORT(p18, p21);
    SORT(p20, p23);
    SORT(p20, p21);
    SORT(p19, p22);
    SORT(p22, p24);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p12, p18);
    SORT(p16, p22);
    SORT(p16, p18);
    SORT(p14, p20);
    SORT(p20, p24);
    SORT(p14, p16);
    SORT(p18, p20);
    SORT(p22, p24);
    SORT(p13, p19);
    SORT(p17, p23);
    SORT(p17, p19);
    SORT(p15, p21);
    SORT(p15, p17);
    SORT(p19, p21);
    SORT(p13, p14);
    SORT(p15, p16);
    SORT(p17, p18);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p0, p12);
    SORT(p8, p20);
    SORT(p8, p12);
    SORT(p4, p16);
    SORT(p16, p24);
    SORT(p12, p16);
    SORT(p2, p14);
    SORT(p10, p22);
    SORT(p10, p14);
    SORT(p6, p18);
    SORT(p6, p10);
    SORT(p10, p12);
    SORT(p1, p13);
    SORT(p9, p21);
    SORT(p9, p13);
    SORT(p5, p17);
    SORT(p13, p17);
    SORT(p3, p15);
    SORT(p11, p23);
    SORT(p11, p15);
    SORT(p7, p19);
    SORT(p7, p11);
    SORT(p11, p13);
    SORT(p11, p12);
    // p12 is the middle position (index 12 of 0..24) of the 25 candidates
    return p12;
}

/** This function applies a non linear filter on a 5x5 box basis on an input image.
 *
 * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_linear_filter_box5x5(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // Fetch 16 elements from each of the five rows, starting two columns to the left
    // of the first output element, so every 5-wide window of the 8 outputs is covered.
    const uchar16 row_m2 = vload16(0, offset(&src_img, -2, -2));
    const uchar16 row_m1 = vload16(0, offset(&src_img, -2, -1));
    const uchar16 row_0  = vload16(0, offset(&src_img, -2, 0));
    const uchar16 row_p1 = vload16(0, offset(&src_img, -2, 1));
    const uchar16 row_p2 = vload16(0, offset(&src_img, -2, 2));

    // Reduce the 5x5 neighbourhood with the operation selected at build time
#if defined(MIN)
    // Column-wise minimum over the five rows, then a 5-wide horizontal minimum
    const uchar16 column_min = min(row_0, min(min(row_m2, row_m1), min(row_p1, row_p2)));
    const uchar8  result     = row_reduce_min_5(column_min);
#elif defined(MAX)
    // Column-wise maximum over the five rows, then a 5-wide horizontal maximum
    const uchar16 column_max = max(row_0, max(max(row_m2, row_m1), max(row_p1, row_p2)));
    const uchar8  result     = row_reduce_max_5(column_max);
#elif defined(MEDIAN)
    const uchar8 result = median_box5x5(row_m2, row_m1, row_0, row_p1, row_p2);
#else
#error "Unsupported filter function"
#endif

    // Write the 8 filtered elements
    vstore8(result, 0, dst_img.ptr);
}

/** This function applies a non linear filter on a 5x5 cross basis on an input image.
 *
 * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
 *
 * @param[in]  src_ptr                           Pointer to the source image.
Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_linear_filter_cross5x5(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Load values
    // The vertical arm of the cross only uses the centre column, i.e. 8 elements per row
    // for the 8 outputs, so the four off-centre rows are loaded as uchar8 (as in
    // non_linear_filter_cross3x3). Only the middle row needs the wider 16-element load,
    // starting two columns to the left, to cover the 5-wide horizontal arm.
    uchar8  top2    = vload8(0, offset(&src, 0, -2));
    uchar8  top     = vload8(0, offset(&src, 0, -1));
    uchar16 middle  = vload16(0, offset(&src, -2, 0));
    uchar8  bottom  = vload8(0, offset(&src, 0, 1));
    uchar8  bottom2 = vload8(0, offset(&src, 0, 2));

    // Apply respective filter
#if defined MIN
    uchar8 tmp_middle = row_reduce_min_5(middle);
    uchar8 out        = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
#elif defined MAX
    uchar8 tmp_middle = row_reduce_max_5(middle);
    uchar8 out        = max(tmp_middle, max(max(top2, top), max(bottom, bottom2)));
#elif defined MEDIAN
    // Nine candidates per output: two above, five along the middle row, two below
    uchar8 p0  = top2;
    uchar8 p1  = top;
    uchar8 p2  = middle.s01234567;
    uchar8 p3  = middle.s12345678;
    uchar8 p4  = middle.s23456789;
    uchar8 p5  = middle.s3456789A;
    uchar8 p6  = middle.s456789AB;
    uchar8 p7  = bottom;
    uchar8 p8  = bottom2;
    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
#else
#error "Unsupported filter function"
#endif

    // Store result
    vstore8(out, 0, dst.ptr);
}

/** This function applies a non linear filter on a 5x5 disk basis on an input image.
 *
 * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_linear_filter_disk5x5(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // The outermost rows contribute a 3-wide window, so they are fetched starting one
    // column to the left; the three inner rows contribute a 5-wide window and are
    // fetched starting two columns to the left.
    const uchar16 row_m2 = vload16(0, offset(&src_img, -1, -2));
    const uchar16 row_m1 = vload16(0, offset(&src_img, -2, -1));
    const uchar16 row_0  = vload16(0, offset(&src_img, -2, 0));
    const uchar16 row_p1 = vload16(0, offset(&src_img, -2, 1));
    const uchar16 row_p2 = vload16(0, offset(&src_img, -1, 2));

    // Reduce the disk-shaped neighbourhood with the operation selected at build time
#if defined(MIN)
    const uchar8 narrow_min = row_reduce_min_3(min(row_m2, row_p2));
    const uchar8 wide_min   = row_reduce_min_5(min(row_0, min(row_m1, row_p1)));
    const uchar8 result     = min(narrow_min, wide_min);
#elif defined(MAX)
    const uchar8 narrow_max = row_reduce_max_3(max(row_m2, row_p2));
    const uchar8 wide_max   = row_reduce_max_5(max(row_0, max(row_m1, row_p1)));
    const uchar8 result     = max(narrow_max, wide_max);
#elif defined(MEDIAN)
    const uchar8 result = median_disk5x5(row_m2, row_m1, row_0, row_p1, row_p2);
#else
#error "Unsupported filter function"
#endif

    // Write the 8 filtered elements
    vstore8(result, 0, dst_img.ptr);
}
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
new file mode 100644
index 0000000000..77da2091b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
@@ -0,0 +1,145 @@
/*
 * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** Sorts element-wise two vectors. + * + * @param[in, out] a First vector + * @param[in, out] b Second vector + */ +#define SORT(a, b) \ + { \ + uchar8 min_val = min(a, b); \ + uchar8 max_val = max(a, b); \ + a = min_val; \ + b = max_val; \ + } + +// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html + +/** Sorting network to sort 5 vectors of 8 elements and return their median. + * + * @param[in] p0 First element vector + * @param[in] p1 Second element vector + * @param[in] p2 Third element vector + * @param[in] p3 Fourth element vector + * @param[in] p4 Fifth element vector + * + * @return Median values for 8 elements. 
 */
inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
{
    // The arguments are passed by value, so the exchanges below only affect local copies.
    // First five exchanges: sort p0..p3 in ascending order (element-wise).
    SORT(p0, p1);
    SORT(p2, p3);
    SORT(p0, p2);
    SORT(p1, p3);
    SORT(p1, p2);
    // Push p4 through the three smallest positions; p2 ends up holding the third
    // smallest of the five values. Positions p3 and p4 are not fully ordered afterwards.
    SORT(p0, p4);
    SORT(p1, p4);
    SORT(p2, p4);

    return p2;
}

/** Sorting network to sort 9 vectors of 8 elements and return their median.
 *
 * @note Only the median is computed; the nine values are not left fully sorted.
 *
 * @param[in] p0 First element vector
 * @param[in] p1 Second element vector
 * @param[in] p2 Third element vector
 * @param[in] p3 Fourth element vector
 * @param[in] p4 Fifth element vector
 * @param[in] p5 Sixth element vector
 * @param[in] p6 Seventh element vector
 * @param[in] p7 Eighth element vector
 * @param[in] p8 Ninth element vector
 *
 * @return Median values for 8 elements.
 */
inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
{
    // First nine exchanges: sort each triple (p0,p1,p2), (p3,p4,p5), (p6,p7,p8) in ascending order.
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p7, p8);
    SORT(p0, p1);
    SORT(p3, p4);
    SORT(p6, p7);
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p7, p8);
    // Next seven exchanges: p6 becomes the largest of the triple minima, p2 the smallest
    // of the triple maxima and p4 the median of the triple medians.
    SORT(p0, p3);
    SORT(p5, p8);
    SORT(p4, p7);
    SORT(p3, p6);
    SORT(p1, p4);
    SORT(p2, p5);
    SORT(p4, p7);
    // Last three exchanges: median of (p2, p4, p6), left in p4. The argument order is intentional.
    SORT(p4, p2);
    SORT(p6, p4);
    SORT(p4, p2);

    return p4;
}

/** Calculate the minimum of a sliding window of size 3.
 *
 * Output lane i is the minimum of input elements i, i + 1 and i + 2, for i in 0..7.
 *
 * @param[in] val Values to calculate the minimum values
 *
 * @return Minimum values of 8 elements on a sliding window of size 3.
 */
inline uchar8 row_reduce_min_3(uchar16 val)
{
    return min(val.s01234567, min(val.s12345678, val.s23456789));
}

/** Calculate the maximum of a sliding window of size 3.
 *
 * Output lane i is the maximum of input elements i, i + 1 and i + 2, for i in 0..7.
 *
 * @param[in] val Values to calculate the maximum values
 *
 * @return Maximum values of 8 elements on a sliding window of size 3.
 */
inline uchar8 row_reduce_max_3(uchar16 val)
{
    return max(val.s01234567, max(val.s12345678, val.s23456789));
}

/** Calculate the minimum of a sliding window of size 5.
 *
 * @param[in] val Values to calculate the minimum values
 *
 * @return Minimum values of 8 elements on a sliding window of size 5.
+ */ +inline uchar8 row_reduce_min_5(uchar16 val) +{ + return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB))); +} + +/** Calculate the maximum of a sliding window of size 5. + * + * @param val Values to calculate the maximum values + * + * @return Maximum values of 8 elements on a sliding window of size 5. + */ +inline uchar8 row_reduce_max_5(uchar16 val) +{ + return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB))); +} diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl new file mode 100644 index 0000000000..0e388d7496 --- /dev/null +++ b/src/core/CL/cl_kernels/nonmax.cl @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#include "helpers.h"

/** This function performs Non maxima suppression over a 3x3 window on a given image.
 *
 * Each work-item handles 8 horizontally adjacent elements. An element is kept when it is
 * greater than or equal to its three neighbours in the row above and its left neighbour,
 * and strictly greater than its right neighbour and its three neighbours in the row below;
 * otherwise 0 is written.
 *
 * @note The element type is taken from the DATA_TYPE preprocessor symbol, which must be defined when building the kernel.
 *       NOTE(review): only F32 is listed below although the code is written against DATA_TYPE - confirm which types the host side passes.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void non_max_suppression(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // vc: the 8 centre values handled by this work-item
    VEC_DATA_TYPE(DATA_TYPE, 8)
    vc = vload8(0, (__global DATA_TYPE *)src.ptr);

    // Early exit: when every centre value is 0 the result is 0 in every lane whatever the
    // neighbours are (each select below yields either 0 or the centre value), so the
    // neighbour loads are skipped.
    if(all(vc == (DATA_TYPE)0))
    {
        vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);

        return;
    }

    // nc: 16 neighbour values of one row, loaded starting one element to the left of vc.
    // nc.s01234567 are the left neighbours, nc.s12345678 the same columns and
    // nc.s23456789 the right neighbours of the 8 centre values.

    // Row above: keep the centre where it is >= all three neighbours, else 0
    VEC_DATA_TYPE(DATA_TYPE, 16)
    nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));

    // Same row: >= the left neighbour, strictly > the right neighbour
    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
    out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));

    // Row below: strictly > all three neighbours
    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
    out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));

    vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
}
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
new file mode 100644
index 0000000000..076b0d8909
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -0,0 +1,154 @@
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** Apply cross map normalization.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 *
 * @param[in]  input_ptr                                   Pointer to the first source tensor.
Supported data types: F16, F32
 * @param[in]  input_stride_x                              Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                              Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                              Stride of the first source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the first source tensor
 * @param[in]  squared_input_ptr                           Pointer to the second source tensor. Supported data types: F16, F32
 * @param[in]  squared_input_stride_x                      Stride of the second source tensor in X dimension (in bytes)
 * @param[in]  squared_input_step_x                        squared_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  squared_input_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
 * @param[in]  squared_input_step_y                        squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  squared_input_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
 * @param[in]  squared_input_step_z                        squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
 * @param[out] output_ptr                                  Pointer to the destination tensor.
Supported data types: F16, F32
 * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
 * @param[in]  coeff                                       Alpha parameter / norm_size
 * @param[in]  beta                                        Beta parameter in the normalization equation
 * @param[in]  kappa                                       Kappa parameter in the normalization equation
 * @param[in]  radius                                      Number of elements on the right or left side to normalize across
 */
__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
                                            TENSOR3D_DECLARATION(squared_input),
                                            TENSOR3D_DECLARATION(output),
                                            float coeff,
                                            float beta,
                                            float kappa,
                                            uint  radius)
{
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D sq_tensor  = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
    Tensor3D dst_tensor = CONVERT_TO_TENSOR3D_STRUCT(output);

    // The Z dimension of the NDRange enumerates the slices
    const int slice_count = get_global_size(2);
    const int slice       = get_global_id(2);
    const int reach       = (int)radius;

    // Window of slices [first, last] around the current one, clamped to the valid range
    const int first = max(slice - reach, (int)0);
    const int last  = min(slice + reach, (int)(slice_count - 1));

    // Sum the squared inputs at the same (x, y) position across the window;
    // tensor3D_offset takes a Z offset relative to the current slice.
    DATA_TYPE sum_sq = 0;
    int       z      = first;
    while(z <= last)
    {
        sum_sq += *(__global DATA_TYPE *)tensor3D_offset(&sq_tensor, 0, 0, z - slice);
        ++z;
    }

    // denominator = (kappa + coeff * sum)^beta
    const float denom  = pow(kappa + coeff * (float)sum_sq, beta);
    const float value  = (float) * ((__global DATA_TYPE *)src_tensor.ptr);
    const float scaled = value / denom;

    *(__global DATA_TYPE *)dst_tensor.ptr = CONVERT(scaled, DATA_TYPE);
}

/** Apply in map normalization.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 *
 * @param[in]  input_ptr                                   Pointer to the first source tensor. Supported data types: F16, F32
 * @param[in]  input_stride_x                              Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                              Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                              Stride of the first source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the first source tensor
 * @param[in]  squared_input_ptr                           Pointer to the second source tensor. Supported data types: F16, F32
 * @param[in]  squared_input_stride_x                      Stride of the second source tensor in X dimension (in bytes)
 * @param[in]  squared_input_step_x                        squared_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  squared_input_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
 * @param[in]  squared_input_step_y                        squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  squared_input_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
 * @param[in]  squared_input_step_z                        squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
 * @param[out] output_ptr                                  Pointer to the destination tensor.
Supported data types: F16, F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] coeff Alpha parameter / norm_size + * @param[in] beta Beta parameter in the normalization equation + * @param[in] kappa Kappa parameter in the normalization equation + * @param[in] radius Number of elements on the right or left side to normalize across + */ +__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(squared_input), + TENSOR3D_DECLARATION(output), + float coeff, + float beta, + float kappa, + uint radius) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 4) + acc_vec = 0; + + const int current_pos = get_global_id(0) << 2; + + const int left_pos = max(current_pos - (int)radius, -3); + const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1)); + + for(int i = left_pos; i <= right_pos; i += 1) + { + acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0)); + } + + const float4 normalized = pow((float4)kappa + coeff * (float4)acc_vec, beta); + + const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized; + + 
vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl new file mode 100644 index 0000000000..e1131d5573 --- /dev/null +++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "helpers.h"
+#include "types.h"
+
+/*
+ * The criteria for lost tracking are that the spatial gradient matrix has:
+ * - Determinant less than DETERMINANT_THR
+ * - or minimum eigenvalue smaller than EIGENVALUE_THR
+ *
+ * The thresholds for the determinant and the minimum eigenvalue are
+ * defined by the OpenVX spec
+ *
+ * Note: Tracking is also lost when the tracked point coordinate is outside
+ * the image coordinates
+ *
+ * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
+ */
+
+/* Internal Lucas-Kanade Keypoint struct */
+typedef struct InternalKeypoint
+{
+    float x; /**< The x coordinate. */
+    float y; /**< The y coordinate. */
+    float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float dummy; /**< Never assigned by the kernels in this file; presumably padding so that a keypoint is four floats wide (other kernels access the internal arrays as float4) - TODO confirm */
+} InternalKeypoint;
+
+/** Threshold for the determinant. Used for lost tracking criteria */
+#define DETERMINANT_THR 1.0e-07f
+
+/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
+#define EIGENVALUE_THR 1.0e-04f
+
+/** Constants used for Lucas-Kanade Algorithm
+ *
+ * W_BITS    : number of fractional bits used for the integer bilinear weights
+ * FLT_SCALE : 2^-20, applied to the integer gradient matrix and mismatch vector when converting them to float
+ * D0        : 2^W_BITS, scale applied to the bilinear weights before rounding
+ * D1        : 2^-(W_BITS - 5), scale applied to the weighted pixel sum
+ */
+#define W_BITS (14)
+#define FLT_SCALE (1.0f / (float)(1 << 20))
+#define D0 ((float)(1 << W_BITS))
+#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
+
+/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
+ *
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates. 
+ */ +__kernel void init_level( + __global float4 *old_points_internal, + __global float4 *new_points_internal, + const float scale) +{ + int idx = get_global_id(0); + + // Get old and new keypoints + float4 old_point = old_points_internal[idx]; + float4 new_point = new_points_internal[idx]; + + // Scale accordingly with the pyramid_scale + old_point.xy *= (float2)(2.0f); + new_point.xy *= (float2)(2.0f); + + old_points_internal[idx] = old_point; + new_points_internal[idx] = new_point; +} + +/** Initializes the internal new points array when the level of pyramid is equal to max. + * + * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. + * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. + * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. + * @param[in] scale Scale factor to apply for the new_point coordinates. + */ +__kernel void init_level_max( + __global Keypoint *old_points, + __global InternalKeypoint *old_points_internal, + __global InternalKeypoint *new_points_internal, + const float scale) +{ + int idx = get_global_id(0); + + Keypoint old_point = old_points[idx]; + + // Get old keypoint to track + InternalKeypoint old_point_internal; + old_point_internal.x = old_point.x * scale; + old_point_internal.y = old_point.y * scale; + old_point_internal.tracking_status = 1.f; + + // Store internal keypoints + old_points_internal[idx] = old_point_internal; + new_points_internal[idx] = old_point_internal; +} + +/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1. + * + * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. + * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid. 
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. + * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. + * @param[in] scale Scale factor to apply for the new_point coordinates. + */ +__kernel void init_level_max_initial_estimate( + __global Keypoint *old_points, + __global Keypoint *new_points_estimates, + __global InternalKeypoint *old_points_internal, + __global InternalKeypoint *new_points_internal, + const float scale) +{ + int idx = get_global_id(0); + + Keypoint old_point = old_points[idx]; + Keypoint new_point_estimate = new_points_estimates[idx]; + InternalKeypoint old_point_internal; + InternalKeypoint new_point_internal; + + // Get old keypoint to track + old_point_internal.x = old_point.x * scale; + old_point_internal.y = old_point.y * scale; + old_point_internal.tracking_status = 1.f; + + // Get new keypoint to track + new_point_internal.x = new_point_estimate.x * scale; + new_point_internal.y = new_point_estimate.y * scale; + new_point_internal.tracking_status = new_point_estimate.tracking_status; + + // Store internal keypoints + old_points_internal[idx] = old_point_internal; + new_points_internal[idx] = new_point_internal; +} + +/** Truncates the coordinates stored in new_points array + * + * @param[in] new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid. + * @param[out] new_points An array of internal key points that are defined at the new_images high resolution pyramid. 
+ */ +__kernel void finalize( + __global InternalKeypoint *new_points_internal, + __global Keypoint *new_points) +{ + int idx = get_global_id(0); + + // Load internal keypoint + InternalKeypoint new_point_internal = new_points_internal[idx]; + + // Calculate output point + Keypoint new_point; + new_point.x = round(new_point_internal.x); + new_point.y = round(new_point_internal.y); + new_point.tracking_status = new_point_internal.tracking_status; + + // Store new point + new_points[idx] = new_point; +} + +/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1. + * + * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8 + * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes) + * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes) + * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image + * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16 + * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes) + * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes) + * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image + * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. 
Supported data types: S16
+ * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes)
+ * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes)
+ * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
+ * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,) in x/y; the z component is used as the lower limit for both coordinates
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if level 0 of the pyramid
+ */
+void __kernel lktracker_stage0(
+    IMAGE_DECLARATION(old_image),
+    IMAGE_DECLARATION(old_scharr_gx),
+    IMAGE_DECLARATION(old_scharr_gy),
+    __global float4 *old_points,
+    __global float4 *new_points,
+    __global float4 *coeff,
+    __global short4 *iold_val,
+    const int window_dimension,
+    const int window_dimension_pow2,
+    const int half_window,
+    const float3 border_limits,
+    const float eig_const,
+    const int level0)
+{
+    // One work-item per keypoint
+    int idx = get_global_id(0);
+
+    Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
+    Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
+    Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
+
+    // Get old keypoint (shifted to the top-left corner of its window)
+    float2 old_keypoint = old_points[idx].xy - (float2)half_window;
+
+    // Get the floor value
+    float2 iold_keypoint = floor(old_keypoint);
+
+    // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+    if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
+    {
+        if(level0 == 1)
+        {
+            // Invalidate tracked point as we are at level 0 (lane 2 holds the tracking status)
+            new_points[idx].s2 = 0.0f;
+        }
+
+        // Not valid coordinate. It sets min_eig to 0.0f
+        coeff[idx].s3 = 0.0f;
+
+        return;
+    }
+
+    // Compute weight for the bilinear interpolation (fractional part of the window origin)
+    float2 ab = old_keypoint - iold_keypoint;
+
+    // Weight used for Bilinear-Interpolation on Scharr images
+    // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
+    // w_scharr.s1 = ab.x * (1.0f - ab.y)
+    // w_scharr.s2 = (1.0f - ab.x) * ab.y
+    // w_scharr.s3 = ab.x * ab.y
+
+    float4 w_scharr;
+    w_scharr.s3 = ab.x * ab.y;
+    w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y;
+    w_scharr.s12 = ab - (float2)w_scharr.s3;
+
+    // Weight used for Bilinear-Interpolation on Old and New images
+    // w.s0 = round(w_scharr.s0 * D0)
+    // w.s1 = round(w_scharr.s1 * D0)
+    // w.s2 = round(w_scharr.s2 * D0)
+    // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+    float4 w;
+    w = round(w_scharr * (float4)D0);
+    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+
+    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+    int4 iG = (int4)0;
+
+    // Window offset: each keypoint owns window_dimension_pow2 consecutive entries of iold_val
+    int window_offset = idx * window_dimension_pow2;
+
+    // Compute Spatial Gradient Matrix G
+    for(ushort ky = 0; ky < window_dimension; ++ky)
+    {
+        int offset_y = iold_keypoint.y + ky;
+        for(ushort kx = 0; kx < window_dimension; ++kx)
+        {
+            int offset_x = iold_keypoint.x + kx;
+            float4 px;
+
+            // Load values from old_image for computing the bilinear interpolation
+            // (two pixels from row offset_y followed by two pixels from row offset_y + 1)
+            px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
+                                         vload2(0, offset(&old_image, offset_x, offset_y + 1))));
+
+            // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy (never assigned)
+            float4 old_i;
+
+            // Compute bilinear interpolation (with D1 scale factor) for ival
+            old_i.s0 = dot(px, w) * D1;
+
+            // Load values from old_scharr_gx for computing the bilinear interpolation
+            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
+                                         vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
+
+            // Compute bilinear interpolation for ixval
+            old_i.s1 = dot(px, w_scharr);
+
+            // Load values from old_scharr_gy for computing the bilinear interpolation
+            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
+                                         vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
+
+            // Compute bilinear interpolation for iyval
+            old_i.s2 = dot(px, w_scharr);
+
+            // Rounding (it could be omitted. Used just for matching the VX implementation)
+            int4 iold = convert_int4(round(old_i));
+
+            // Accumulate values in the Spatial Gradient Matrix
+            iG.s0 += (int)(iold.s1 * iold.s1);
+            iG.s1 += (int)(iold.s1 * iold.s2);
+            iG.s2 += (int)(iold.s2 * iold.s2);
+
+            // Store ival, ixval and iyval
+            iold_val[window_offset + kx] = convert_short4(iold);
+        }
+        // Advance to the next row of this keypoint's window
+        window_offset += window_dimension;
+    }
+
+    // Scale iA11, iA12 and iA22
+    float4 G = convert_float4(iG) * (float4)FLT_SCALE;
+
+    // Compute minimum eigen value
+    G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
+
+    // Store A11, A12, A22 and min_eig
+    coeff[idx] = G;
+}
+
+/** Computes the motion vector for a given keypoint
+ *
+ * @param[in] new_image_ptr Pointer to the input new image. 
Supported data types: U8 + * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes) + * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes) + * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image + * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid + * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint + * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint + * @param[in] window_dimension The size of the window on which to perform the algorithm + * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm + * @param[in] half_window The half size of the window on which to perform the algorithm + * @param[in] num_iterations The maximum number of iterations + * @param[in] epsilon The value for terminating the algorithm. 
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,) + * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension) + * @param[in] level0 It is set to 1 if level of pyramid = 0 + * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS + * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON + */ +void __kernel lktracker_stage1( + IMAGE_DECLARATION(new_image), + __global float4 *new_points, + __global float4 *coeff, + __global short4 *iold_val, + const int window_dimension, + const int window_dimension_pow2, + const int half_window, + const int num_iterations, + const float epsilon, + const float3 border_limits, + const float eig_const, + const int level0, + const int term_iteration, + const int term_epsilon) +{ + int idx = get_global_id(0); + Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image); + + // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig + float4 G = coeff[idx]; + + // Determinant + float D = G.s0 * G.s2 - G.s1 * G.s1; + + // Check if it is a good point to track + if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR) + { + if(level0 == 1) + { + // Invalidate tracked point as we are at level 0 + new_points[idx].s2 = 0; + } + + return; + } + + // Compute inverse + //D = native_recip(D); + D = 1.0 / D; + + // Get new keypoint + float2 new_keypoint = new_points[idx].xy - (float)half_window; + + // Get new point + float2 out_new_point = new_points[idx].xy; + + // Keep delta obtained in the previous iteration + float2 prev_delta = (float2)0.0f; + + int j = 0; + while(j < num_iterations) + { + // Get the floor value + float2 inew_keypoint = floor(new_keypoint); + + // Check if using the window dimension we can go out of boundary in the following for loops. 
If so, invalidate the tracked point + if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy)) + { + if(level0 == 1) + { + // Invalidate tracked point as we are at level 0 + new_points[idx].s2 = 0.0f; + } + else + { + new_points[idx].xy = out_new_point; + } + + return; + } + + // Compute weight for the bilinear interpolation + float2 ab = new_keypoint - inew_keypoint; + + // Weight used for Bilinear-Interpolation on Old and New images + // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0) + // w.s1 = round(ab.x * (1.0f - ab.y) * D0) + // w.s2 = round((1.0f - ab.x) * ab.y * D0) + // w.s3 = D0 - w.s0 - w.s1 - w.s2 + + float4 w; + w.s3 = ab.x * ab.y; + w.s0 = w.s3 + 1.0f - ab.x - ab.y; + w.s12 = ab - (float2)w.s3; + w = round(w * (float4)D0); + w.s3 = D0 - w.s0 - w.s1 - w.s2; + + // Mismatch vector + int2 ib = 0; + + // Old val offset + int old_val_offset = idx * window_dimension_pow2; + + for(int ky = 0; ky < window_dimension; ++ky) + { + for(int kx = 0; kx < window_dimension; ++kx) + { + // ival, ixval and iyval have been computed in the previous stage + int4 old_ival = convert_int4(iold_val[old_val_offset]); + + // Load values from old_image for computing the bilinear interpolation + float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)), + vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1)))); + + // Compute bilinear interpolation on new image + int jval = (int)round(dot(px, w) * D1); + + // Compute luminance difference + int diff = (int)(jval - old_ival.s0); + + // Accumulate values in mismatch vector + ib += (diff * old_ival.s12); + + // Update old val offset + old_val_offset++; + } + } + + float2 b = convert_float2(ib) * (float2)FLT_SCALE; + + // Optical Flow + float2 delta; + + delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D); + delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D); + + // Update new point coordinate + new_keypoint += delta; + + out_new_point 
= new_keypoint + (float2)half_window; + + if(term_epsilon == 1) + { + float mag2 = dot(delta, delta); + + if(mag2 <= epsilon) + { + new_points[idx].xy = out_new_point; + + return; + } + } + + // Check convergence analyzing the previous delta + if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f)) + { + out_new_point -= delta * (float2)0.5f; + + new_points[idx].xy = out_new_point; + + return; + } + + // Update previous delta + prev_delta = delta; + + if(term_iteration == 1) + { + j++; + } + } + + new_points[idx].xy = out_new_point; +} diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl new file mode 100644 index 0000000000..ae2031f422 --- /dev/null +++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "helpers.h"
+
+/* Conversion helpers.
+ * CONVERT_OP_FLOAT(x, type, round) expands to convert_<type><round>(x), or to
+ * convert_<type>_sat<round>(x) when SATURATE is defined. The extra _STR level lets
+ * `type` and `round` be macro-expanded before they are pasted into the function name.
+ */
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. 
Supported data types: U8, S16, F16, F32 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. 
Supported data types: F32 + */ +__kernel void pixelwise_mul_float( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out), + const float scale) +{ + // Get pixels pointer + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + + // Perform multiplication +#if defined DATA_TYPE_FLOAT + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); +#else + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); +#endif + + // Store result + vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl new file mode 100644 index 0000000000..05c437cd17 --- /dev/null +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/* Conversion helpers.
+ * CONVERT_OP_INT(x, type) expands to convert_<type>(x), or to convert_<type>_sat(x)
+ * when SATURATE is defined. The extra _STR level lets `type` be macro-expanded
+ * before it is pasted into the function name.
+ */
+#ifdef SATURATE
+#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
+#else
+#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
+#endif
+#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data_type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Integer scaling factor. 
Supported data types: S32 + */ +__kernel void pixelwise_mul_int( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out), + const uint scale) +{ + // Get pixels pointer + Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); + Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + + // Perform multiplication and store result + vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl new file mode 100644 index 0000000000..1902df9b7d --- /dev/null +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/* Element-wise reduction used by the pooling kernels:
 * a sum when POOL_AVG is defined (average pooling), fmax() otherwise (max pooling).
 */
#if defined(POOL_AVG)
#define POOL_OP(x, y) ((x) + (y))
#else /* defined(POOL_AVG) */
#define POOL_OP(x, y) (fmax((x), (y)))
#endif /* defined(POOL_AVG) */

/** Computes the averaging factor (1 / window area) for the pooling window of the current work item.
 *
 * The window starts at (global_id * stride - pad) in each dimension and spans @p pool_size elements;
 * its end is clipped to @p upper_bound_w / @p upper_bound_h. The start is not clipped.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] upper_bound_w Upper limit applied to the window end along X
 * @param[in] upper_bound_h Upper limit applied to the window end along Y
 * @param[in] pad_x         Padding subtracted from the window start along X
 * @param[in] pad_y         Padding subtracted from the window start along Y
 * @param[in] stride_x      Pooling stride along X
 * @param[in] stride_y      Pooling stride along Y
 *
 * @return The reciprocal of (clipped window height * clipped window width).
 */
float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                          const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Top-left corner of the window for this work item
    const int first_col = get_global_id(0) * stride_x - pad_x;
    const int first_row = get_global_id(1) * stride_y - pad_y;

    // Extent of the window once its far edge has been clipped to the upper bounds
    const int window_w = min(first_col + pool_size, upper_bound_w) - first_col;
    const int window_h = min(first_row + pool_size, upper_bound_h) - first_row;

    return 1.f / (window_h * window_w);
}
/** Performs a pooling function with a 2x2 pooling window.
 *
 * Each work item reads two rows of two elements, reduces them with POOL_OP and writes one output element.
 *
 * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
 *       (NOTE(review): POOL_STRIDE is not referenced in this kernel body - confirm where it is consumed)
 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
 * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
 *       The arguments @p max_dims, @p strides and @p paddings only exist when -DPOOL_AVG is defined.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  max_dims                             The maximum index that can be accessed in x and y dimension (width + pad)
 * @param[in]  strides                              The pooling operation strides in each dimension
 * @param[in]  paddings                             The pooling operation paddings in each dimension
 */
__kernel void pooling_layer_2(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output)
#ifdef POOL_AVG
    ,
    int2 max_dims, int2 strides, int2 paddings
#endif
)
{
    // Resolve the per-work-item tensor views
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Fetch the two rows of the 2x2 window
    const VEC_DATA_TYPE(DATA_TYPE, 2) row0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
    const VEC_DATA_TYPE(DATA_TYPE, 2) row1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));

    // Reduce vertically first, then horizontally
    const VEC_DATA_TYPE(DATA_TYPE, 2) columns = POOL_OP(row0, row1);
    DATA_TYPE pooled = POOL_OP(columns.s0, columns.s1);

#ifdef POOL_AVG
    // Turn the sum into an average: multiply by 1 / (area of the window clipped to max_dims)
    pooled *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
#endif

    // Write the pooled value
    *(__global DATA_TYPE *)dst.ptr = pooled;
}
/** Performs a pooling function with a 3x3 pooling window.
 *
 * Each work item reads three rows of three elements, reduces them with POOL_OP and writes one output element.
 *
 * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
 *       (NOTE(review): POOL_STRIDE is not referenced in this kernel body - confirm where it is consumed)
 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
 * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
 *       The arguments @p max_dims, @p strides and @p paddings only exist when -DPOOL_AVG is defined.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  max_dims                             The maximum index that can be accessed in x and y dimension (width + pad)
 * @param[in]  strides                              The pooling operation strides in each dimension
 * @param[in]  paddings                             The pooling operation paddings in each dimension
 */
__kernel void pooling_layer_3(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output)
#ifdef POOL_AVG
    ,
    int2 max_dims, int2 strides, int2 paddings
#endif
)
{
    // Resolve the per-work-item tensor views
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Fetch the three rows of the 3x3 window
    const VEC_DATA_TYPE(DATA_TYPE, 3) row0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
    const VEC_DATA_TYPE(DATA_TYPE, 3) row1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
    const VEC_DATA_TYPE(DATA_TYPE, 3) row2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));

    // Reduce vertically (row0 with row1, then with row2), then horizontally left to right
    const VEC_DATA_TYPE(DATA_TYPE, 3) columns = POOL_OP(POOL_OP(row0, row1), row2);
    DATA_TYPE pooled = POOL_OP(POOL_OP(columns.s0, columns.s1), columns.s2);

#ifdef POOL_AVG
    // Turn the sum into an average: multiply by 1 / (area of the window clipped to max_dims)
    pooled *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
#endif

    // Write the pooled value
    *(__global DATA_TYPE *)dst.ptr = pooled;
}
/** Remaps an input image to an output image through two coordinate maps, using nearest neighbour interpolation.
 *
 * This kernel performs remapping with this method of pixel coordinate translation:
 *     out(x,y) = in(mapx(x,y), mapy(x,y));
 *
 * Each work item handles four output pixels: it reads four x and four y coordinates, adds 0.5 to each,
 * clamps them with clamp_to_border() and converts them to integers before fetching the texels.
 *
 * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
 * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
 * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
 * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
 * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
 * @param[in]  mapx_stride_x                      Stride of the x remapping image in X dimension (in bytes)
 * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapx_stride_y                      Stride of the x remapping image in Y dimension (in bytes)
 * @param[in]  mapx_step_y                        mapx_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the x remapping image
 * @param[in]  mapy_ptr                           Pointer to the y remapping image. Supported data types: F32.
 * @param[in]  mapy_stride_x                      Stride of the y remapping image in X dimension (in bytes)
 * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapy_stride_y                      Stride of the y remapping image in Y dimension (in bytes)
 * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the y remapping image
 * @param[in]  width                              Width of the input image
 * @param[in]  height                             Height of the input image
 */
__kernel void remap_nearest_neighbour(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    IMAGE_DECLARATION(mapx),
    IMAGE_DECLARATION(mapy),
    const float width,
    const float height)
{
    // The source view is built without stepping; the others are positioned for this work item
    Image src_img  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image dst_img  = CONVERT_TO_IMAGE_STRUCT(out);
    Image xmap_img = CONVERT_TO_IMAGE_STRUCT(mapx);
    Image ymap_img = CONVERT_TO_IMAGE_STRUCT(mapy);

    // Four x coordinates and four y coordinates
    const float4 xs = vload4(0, (__global float *)xmap_img.ptr);
    const float4 ys = vload4(0, (__global float *)ymap_img.ptr);

    // Interleave into (x0, y0, x1, y1, x2, y2, x3, y3) and shift by half a pixel
    const float8 planar    = (float8)(xs, ys);
    const float8 sample_at = planar.s04152637 + (float8)(0.5f);

    // Clamp, convert to integer coordinates, fetch and store the four texels
    vstore4(read_texels4(&src_img, convert_int8(clamp_to_border(sample_at, width, height))), 0, dst_img.ptr);
}
/** Remaps an input image to an output image through two coordinate maps, using bilinear interpolation.
 *
 * This kernel performs remapping with this method of pixel coordinate translation:
 *     out(x,y) = in(mapx(x,y), mapy(x,y));
 *
 * Each work item handles four output pixels: it reads four x and four y coordinates, clamps them with
 * clamp_to_border() and passes them to bilinear_interpolate().
 *
 * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
 * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
 * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
 * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
 * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
 * @param[in]  mapx_stride_x                      Stride of the x remapping image in X dimension (in bytes)
 * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapx_stride_y                      Stride of the x remapping image in Y dimension (in bytes)
 * @param[in]  mapx_step_y                        mapx_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the x remapping image
 * @param[in]  mapy_ptr                           Pointer to the y remapping image. Supported data types: F32.
 * @param[in]  mapy_stride_x                      Stride of the y remapping image in X dimension (in bytes)
 * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapy_stride_y                      Stride of the y remapping image in Y dimension (in bytes)
 * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the y remapping image
 * @param[in]  width                              Width of the input image
 * @param[in]  height                             Height of the input image
 */
__kernel void remap_bilinear(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    IMAGE_DECLARATION(mapx),
    IMAGE_DECLARATION(mapy),
    const float width,
    const float height)
{
    // The source view is built without stepping; the others are positioned for this work item
    Image src_img  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image dst_img  = CONVERT_TO_IMAGE_STRUCT(out);
    Image xmap_img = CONVERT_TO_IMAGE_STRUCT(mapx);
    Image ymap_img = CONVERT_TO_IMAGE_STRUCT(mapy);

    // Four x coordinates and four y coordinates
    const float4 xs = vload4(0, (__global float *)xmap_img.ptr);
    const float4 ys = vload4(0, (__global float *)ymap_img.ptr);

    // Interleave into (x0, y0, x1, y1, x2, y2, x3, y3); no half-pixel shift in the bilinear path
    const float8 planar    = (float8)(xs, ys);
    const float8 sample_at = planar.s04152637;

    // Clamp, interpolate and store the four results
    vstore4(bilinear_interpolate(&src_img, clamp_to_border(sample_at, width, height), width, height), 0, dst_img.ptr);
}
/** Maps four horizontally adjacent output coordinates to input coordinates for nearest neighbour sampling.
 *
 * The four points are (x, y), (x + 1, y), (x + 2, y) and (x + 3, y), where (x, y) is @p coord.
 * Every component is transformed as (c + 0.5) * scale.
 *
 * @param[in] coord 2D coordinates of the first point to transform.
 * @param[in] scale input/output scale ratio (s0 applies to x, s1 applies to y)
 *
 * @return a float8 containing 4 2D transformed values in the input image, laid out as (x0, y0, x1, y1, x2, y2, x3, y3).
 */
inline const float8 transform_nearest(const float2 coord, const float2 scale)
{
    // x positions of the four neighbouring output pixels
    const float4 out_xs = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);

    // Sample at pixel centres, then apply the scale ratio
    const float4 in_xs = (out_xs + 0.5f) * scale.s0;
    const float  in_y  = (coord.s1 + 0.5f) * scale.s1;

    // All four points share the same y
    return (float8)(in_xs.s0, in_y, in_xs.s1, in_y, in_xs.s2, in_y, in_xs.s3, in_y);
}
/** Maps four horizontally adjacent output coordinates to input coordinates for bilinear sampling.
 *
 * The four points are (x, y), (x + 1, y), (x + 2, y) and (x + 3, y), where (x, y) is @p coord.
 * Every component is transformed as (c + 0.5) * scale - 0.5.
 *
 * @param[in] coord 2D coordinates of the first point to transform.
 * @param[in] scale input/output scale ratio (s0 applies to x, s1 applies to y)
 *
 * @return a float8 containing 4 2D transformed values in the input image, laid out as (x0, y0, x1, y1, x2, y2, x3, y3).
 */
inline const float8 transform_bilinear(const float2 coord, const float2 scale)
{
    // x positions of the four neighbouring output pixels
    const float4 out_xs = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);

    // Move to pixel centres, apply the scale ratio, then move back by half a pixel
    const float4 in_xs = (out_xs + 0.5f) * scale.s0 - 0.5f;
    const float  in_y  = (coord.s1 + 0.5f) * scale.s1 - 0.5f;

    // All four points share the same y
    return (float8)(in_xs.s0, in_y, in_xs.s1, in_y, in_xs.s2, in_y, in_xs.s3, in_y);
}
/** Resizes an image, interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
 *
 * Each work item produces four output pixels. Output coordinates are mapped to the input with
 * transform_nearest() using the input/output size ratio, clamped with clamp_to_border() and converted to integers.
 *
 * @note The destination pointer is cast to DATA_TYPE, which must be defined at compile time.
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  output_width                      Output image width
 * @param[in]  output_height                     Output image height
 */
__kernel void scale_nearest_neighbour(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float output_width,
    const float output_height)
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);

    // Input/output size ratio per dimension
    const float2 ratio = (float2)(input_width / output_width, input_height / output_height);

    // Map the current output coordinates into the input image and keep them inside it
    const float8 mapped  = transform_nearest(get_current_coords(), ratio);
    const float8 clamped = clamp_to_border(mapped, input_width, input_height);

    // Fetch the four texels at the truncated coordinates and store them
    vstore4(read_texels4(&src_img, convert_int8(clamped)), 0, (__global DATA_TYPE *)dst_img.ptr);
}
/** Resizes an image, interpolating with the BILINEAR method.
 *
 * Each work item produces four output pixels. Output coordinates are mapped to the input with
 * transform_bilinear() using the input/output size ratio, clamped with clamp_to_border() and
 * passed to bilinear_interpolate().
 *
 * @note The destination pointer is cast to DATA_TYPE, which must be defined at compile time.
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  output_width                      Output image width
 * @param[in]  output_height                     Output image height
 */
__kernel void scale_bilinear(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float output_width,
    const float output_height)
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);

    // Input/output size ratio per dimension
    const float2 ratio = (float2)(input_width / output_width, input_height / output_height);

    // Map the current output coordinates into the input image and keep them inside it
    const float8 mapped  = transform_bilinear(get_current_coords(), ratio);
    const float8 clamped = clamp_to_border(mapped, input_width, input_height);

    // Interpolate the four results and store them
    vstore4(bilinear_interpolate(&src_img, clamped, input_width, input_height), 0, (__global DATA_TYPE *)dst_img.ptr);
}
/** This OpenCL kernel computes Scharr3x3.
 *
 * Each work item produces 8 output pixels per enabled gradient from three 16-byte row loads
 * that start one pixel to the left of the current position (rows -1, 0 and +1).
 *
 * Coefficients applied:
 *     Gx = | -3  0  +3 |      Gy = | -3 -10 -3 |
 *          | -10 0 +10 |           |  0   0  0 |
 *          | -3  0  +3 |           | +3 +10 +3 |
 *
 * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
 * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
 *
 * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] dst_gx_ptr                           Pointer to the X gradient destination image. Supported data types: S16
 * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[out] dst_gy_ptr                           Pointer to the Y gradient destination image. Supported data types: S16
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void scharr3x3(
    IMAGE_DECLARATION(src)
#ifdef GRAD_X
    ,
    IMAGE_DECLARATION(dst_gx)
#endif
#ifdef GRAD_Y
    ,
    IMAGE_DECLARATION(dst_gy)
#endif
)
{
    Image in_img = CONVERT_TO_IMAGE_STRUCT(src);

    // Three source rows, each starting one pixel left of the current position
    const uchar16 row_top = vload16(0, offset(&in_img, -1, -1));
    const uchar16 row_mid = vload16(0, offset(&in_img, -1, 0));
    const uchar16 row_bot = vload16(0, offset(&in_img, -1, 1));

#ifdef GRAD_X
    {
        Image out_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);

        // Horizontal difference (right neighbour - left neighbour) for each row
        const short8 dx_top = convert_short8(row_top.s23456789) - convert_short8(row_top.s01234567);
        const short8 dx_mid = convert_short8(row_mid.s23456789) - convert_short8(row_mid.s01234567);
        const short8 dx_bot = convert_short8(row_bot.s23456789) - convert_short8(row_bot.s01234567);

        // Outer rows weigh 3, the centre row weighs 10
        const short8 gx = (dx_top + dx_bot) * (short8)(3) + dx_mid * (short8)(10);

        vstore8(gx, 0, ((__global short *)out_gx.ptr));
    }
#endif

#ifdef GRAD_Y
    {
        Image out_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);

        // Vertical difference (bottom row - top row) for the left, middle and right columns
        const short8 dy_left   = convert_short8(row_bot.s01234567) - convert_short8(row_top.s01234567);
        const short8 dy_middle = convert_short8(row_bot.s12345678) - convert_short8(row_top.s12345678);
        const short8 dy_right  = convert_short8(row_bot.s23456789) - convert_short8(row_top.s23456789);

        // Outer columns weigh 3, the centre column weighs 10
        const short8 gy = (dy_left + dy_right) * (short8)(3) + dy_middle * (short8)(10);

        vstore8(gy, 0, ((__global short *)out_gy.ptr));
    }
#endif
}
+ * + * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient + * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 + * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void sobel3x3(
    IMAGE_DECLARATION(src)
#ifdef GRAD_X
    ,
    IMAGE_DECLARATION(dst_gx)
#endif
#ifdef GRAD_Y
    ,
    IMAGE_DECLARATION(dst_gy)
#endif
)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
#endif
#ifdef GRAD_Y
    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
#endif

    // Fetch the three rows of the 3x3 window, each starting one pixel to the left
    const uchar16 row_top = vload16(0, offset(&src, -1, -1));
    const uchar16 row_mid = vload16(0, offset(&src, -1, 0));
    const uchar16 row_bot = vload16(0, offset(&src, -1, 1));

    // Shifted 8-pixel views of every row, widened to 16 bit
    const short8 top_l = convert_short8(row_top.s01234567);
    const short8 top_c = convert_short8(row_top.s12345678);
    const short8 top_r = convert_short8(row_top.s23456789);
    const short8 mid_l = convert_short8(row_mid.s01234567);
    const short8 mid_r = convert_short8(row_mid.s23456789);
    const short8 bot_l = convert_short8(row_bot.s01234567);
    const short8 bot_c = convert_short8(row_bot.s12345678);
    const short8 bot_r = convert_short8(row_bot.s23456789);

#ifdef GRAD_X
    // X gradient: left/right columns weighted -1/+1 (top, bottom) and -2/+2 (middle row)
    short8 gx = (short8)0;
    gx += top_l * (short8)(-1);
    gx += top_r * (short8)(+1);
    gx += mid_l * (short8)(-2);
    gx += mid_r * (short8)(+2);
    gx += bot_l * (short8)(-1);
    gx += bot_r * (short8)(+1);
#endif
#ifdef GRAD_Y
    // Y gradient: top row weighted (-1, -2, -1), bottom row weighted (+1, +2, +1)
    short8 gy = (short8)0;
    gy += top_l * (short8)(-1);
    gy += top_c * (short8)(-2);
    gy += top_r * (short8)(-1);
    gy += bot_l * (short8)(+1);
    gy += bot_c * (short8)(+2);
    gy += bot_r * (short8)(+1);
#endif

    // Write the 8 results of each enabled gradient
#ifdef GRAD_X
    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
#endif
#ifdef GRAD_Y
    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
#endif
}

/**********************************************/
/*    End implementation of Sobel3x3 filter   */
/**********************************************/

/***********************************************/
/*   Begin implementation of Sobel5x5 filter   */
/***********************************************/

/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
 *
 * @param[in] src             Pointer to source image.
 * @param[in] left1_coeff_gx  Weight of the most left pixel for gx
 * @param[in] left2_coeff_gx  Weight of the left pixel for gx
 * @param[in] middle_coeff_gx Weight of the middle pixel for gx
 * @param[in] right1_coeff_gx Weight of the right pixel for gx
 * @param[in] right2_coeff_gx Weight of the most right pixel for gx
 * @param[in] left1_coeff_gy  Weight of the most left pixel for gy
 * @param[in] left2_coeff_gy  Weight of the left pixel for gy
 * @param[in] middle_coeff_gy Weight of the middle pixel for gy
 * @param[in] right1_coeff_gy Weight of the right pixel for gy
 * @param[in] right2_coeff_gy Weight of the most right pixel for gy
 *
 * @return a short16 containing short8 gx and short8 gy values.
+ */ +short16 sobel1x5( + Image *src, + const short left1_coeff_gx, + const short left2_coeff_gx, + const short middle_coeff_gx, + const short right1_coeff_gx, + const short right2_coeff_gx, + const short left1_coeff_gy, + const short left2_coeff_gy, + const short middle_coeff_gy, + const short right1_coeff_gy, + const short right2_coeff_gy) +{ + uchar16 temp = vload16(0, offset(src, -2, 0)); + short8 gx = 0; + short8 gy = 0; + short8 val; + + val = convert_short8(temp.s01234567); + gx += val * (short8)left1_coeff_gx; + gy += val * (short8)left1_coeff_gy; + + val = convert_short8(temp.s12345678); + gx += val * (short8)left2_coeff_gx; + gy += val * (short8)left2_coeff_gy; + + val = convert_short8(temp.s23456789); + gx += val * (short8)middle_coeff_gx; + gy += val * (short8)middle_coeff_gy; + + val = convert_short8(temp.s3456789a); + gx += val * (short8)right1_coeff_gx; + gy += val * (short8)right1_coeff_gy; + + val = convert_short8(temp.s456789ab); + gx += val * (short8)right2_coeff_gx; + gy += val * (short8)right2_coeff_gy; + + return (short16)(gx, gy); +} + +/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). + * + * @param[in] src Pointer to source image. + * @param[in] up1_coeff Weight of the most up pixel + * @param[in] up2_coeff Weight of the up pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] down1_coeff Weight of the down pixel + * @param[in] down2_coeff Weight of the most down pixel + * + * @return a short8 containing 8 convoluted values. 
+ */ +short8 sobel5x1( + Image *src, + const short up1_coeff, + const short up2_coeff, + const short middle_coeff, + const short down1_coeff, + const short down2_coeff) +{ + short8 val; + short8 out = (short8)0; + + val = vload8(0, (__global short *)offset(src, 0, -2)); + out += val * (short8)up1_coeff; + + val = vload8(0, (__global short *)offset(src, 0, -1)); + out += val * (short8)up2_coeff; + + val = vload8(0, (__global short *)offset(src, 0, 0)); + out += val * (short8)middle_coeff; + + val = vload8(0, (__global short *)offset(src, 0, 1)); + out += val * (short8)down1_coeff; + + val = vload8(0, (__global short *)offset(src, 0, 2)); + out += val * (short8)down2_coeff; + + return (short8)(out); +} + +/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images. + * + * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient + * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. + * + * @param[in] src_ptr Pointer to the source image.. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image.. 
Supported data types: S16
 * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void sobel_separable1x5(
    IMAGE_DECLARATION(src)
#ifdef GRAD_X
    ,
    IMAGE_DECLARATION(dst_gx)
#endif
#ifdef GRAD_Y
    ,
    IMAGE_DECLARATION(dst_gy)
#endif
)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Horizontal pass: gx taps (-1, -2, 0, 2, 1), gy taps (1, 4, 6, 4, 1).
    // sobel1x5 returns gx in the lower 8 lanes and gy in the upper 8 lanes.
    const short16 packed = sobel1x5(&src,
                                    -1, -2, 0, 2, 1,
                                    1, 4, 6, 4, 1);

#ifdef GRAD_X
    // Lower half -> X temporary image
    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
    vstore8(packed.lo, 0, (__global short *)dst_gx.ptr);
#endif
#ifdef GRAD_Y
    // Upper half -> Y temporary image
    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
    vstore8(packed.hi, 0, (__global short *)dst_gy.ptr);
#endif
}

/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
 *  and output two single channel S16 images.
+ * + * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient + * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. + * + * @param[in] src_x_ptr Pointer to the source image. Supported data types: S16 + * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 + * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] src_y_ptr Pointer to the source image. 
Supported data types: S16
 * @param[in]  src_y_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  src_y_step_x                         src_y_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_y_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_y_step_y                         src_y_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_y_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  dummy                                Dummy parameter to ease conditional inclusion
 */
__kernel void sobel_separable5x1(
#ifdef GRAD_X
    IMAGE_DECLARATION(src_x),
    IMAGE_DECLARATION(dst_gx),
#endif
#ifdef GRAD_Y
    IMAGE_DECLARATION(src_y),
    IMAGE_DECLARATION(dst_gy),
#endif
    int dummy)
{
#ifdef GRAD_X
    // X gradient: vertical pass with taps (1, 4, 6, 4, 1) over the X temporary image
    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);

    const short8 gx = sobel5x1(&src_x,
                               1, 4, 6, 4, 1);
    vstore8(gx, 0, (__global short *)dst_gx.ptr);
#endif
#ifdef GRAD_Y
    // Y gradient: vertical pass with taps (-1, -2, 0, 2, 1) over the Y temporary image
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);

    const short8 gy = sobel5x1(&src_y,
                               -1, -2, 0, 2, 1);
    vstore8(gy, 0, (__global short *)dst_gy.ptr);
#endif
}

/**********************************************/
/*    End implementation of Sobel5x5 filter   */
/**********************************************/

/***********************************************/
/*   Begin implementation of Sobel7x7 filter   */
/***********************************************/

/* Derivative taps X0..X6.
 * SOBEL1x1_HOR applies them to gx in the horizontal pass; SOBEL7x1 applies them
 * in the vertical pass when invoked with direction = X.
 */
#define X0 -1
#define X1 -4
#define X2 -5
#define X3 0
#define X4 5
#define X5 4
#define X6 1

/* Smoothing taps Y0..Y6.
 * SOBEL1x1_HOR applies them to gy in the horizontal pass; SOBEL7x1 applies them
 * in the vertical pass when invoked with direction = Y.
 */
#define Y0 1
#define Y1 6
#define Y2 15
#define Y3 20
#define Y4 15
#define Y5 6
#define Y6 1

/* Calculates single horizontal iteration:
 * loads 8 elements starting (idx - 3) columns from the current position of src,
 * widens them to int8 and accumulates val * X<idx> into gx and val * Y<idx> into gy.
 * idx must be a literal 0..6 because it is token-pasted onto X / Y.
 */
#define SOBEL1x1_HOR(src, gx, gy, idx)                               \
    {                                                                \
        int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
        gx += val * X##idx;                                          \
        gy += val * Y##idx;                                          \
    }

/* Calculates single vertical iteration:
 * loads 8 ints from the row (idx - 3) relative to the current position of src and
 * accumulates val * <direction><idx> into g. direction is the bare token X or Y
 * and idx a literal 0..6; the two are token-pasted to select the coefficient.
 */
#define SOBEL1x1_VERT(src, g, direction, idx)                          \
    {                                                                  \
        int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
        g += val * (int8)direction##idx;                               \
    }

/* Calculates a 1x7 horizontal iteration (all seven taps, columns -3..+3). */
#define SOBEL1x7(ptr, gx, gy)    \
    SOBEL1x1_HOR(ptr, gx, gy, 0) \
    SOBEL1x1_HOR(ptr, gx, gy, 1) \
    SOBEL1x1_HOR(ptr, gx, gy, 2) \
    SOBEL1x1_HOR(ptr, gx, gy, 3) \
    SOBEL1x1_HOR(ptr, gx, gy, 4) \
    SOBEL1x1_HOR(ptr, gx, gy, 5) \
    SOBEL1x1_HOR(ptr, gx, gy, 6)

/* Calculates a 7x1 vertical iteration (all seven taps, rows -3..+3). */
#define SOBEL7x1(ptr, g, direction)     \
    SOBEL1x1_VERT(ptr, g, direction, 0) \
    SOBEL1x1_VERT(ptr, g, direction, 1) \
    SOBEL1x1_VERT(ptr, g, direction, 2) \
    SOBEL1x1_VERT(ptr, g, direction, 3) \
    SOBEL1x1_VERT(ptr, g, direction, 4) \
    SOBEL1x1_VERT(ptr, g, direction, 5) \
    SOBEL1x1_VERT(ptr, g, direction, 6)

/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S32 images and leave the borders undefined.
 *
 * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
 * is performed when -DGRAD_Y is used.
You can use both when computation of both gradients is required. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32 + * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S32
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void sobel_separable1x7(
    IMAGE_DECLARATION(src)
#ifdef GRAD_X
    ,
    IMAGE_DECLARATION(dst_gx)
#endif
#ifdef GRAD_Y
    ,
    IMAGE_DECLARATION(dst_gy)
#endif
)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // 32-bit accumulators for the 8 output pixels of each gradient
    int8 grad_x = (int8)0;
    int8 grad_y = (int8)0;

    // Horizontal 7-tap pass; both accumulators are always updated by the macro
    SOBEL1x7(&src, grad_x, grad_y);

#ifdef GRAD_X
    // Write the X temporary result as 8 ints
    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
    vstore8(grad_x, 0, (__global int *)dst_gx.ptr);
#endif
#ifdef GRAD_Y
    // Write the Y temporary result as 8 ints
    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
    vstore8(grad_y, 0, (__global int *)dst_gy.ptr);
#endif
}

/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images and output two single channel S32 images and leave the borders undefined.
 *
 * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
 * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
 *
 * @param[in]  src_x_ptr                            Pointer to the source image.
Supported data types: S32 + * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32 + * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32 + * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S32
 * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  dummy                                Dummy parameter to ease conditional inclusion
 */
__kernel void sobel_separable7x1(
#ifdef GRAD_X
    IMAGE_DECLARATION(src_x),
    IMAGE_DECLARATION(dst_gx),
#endif
#ifdef GRAD_Y
    IMAGE_DECLARATION(src_y),
    IMAGE_DECLARATION(dst_gy),
#endif
    int dummy)
{
#ifdef GRAD_X
    // X gradient: vertical 7-tap pass over the X temporary image using the Y0..Y6 taps
    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);

    int8 grad_x = (int8)0;
    SOBEL7x1(&src_x, grad_x, Y);
    vstore8(grad_x, 0, (__global int *)dst_gx.ptr);
#endif
#ifdef GRAD_Y
    // Y gradient: vertical 7-tap pass over the Y temporary image using the X0..X6 taps
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);

    int8 grad_y = (int8)0;
    SOBEL7x1(&src_y, grad_y, X);
    vstore8(grad_y, 0, (__global int *)dst_gy.ptr);
#endif
}

/**********************************************/
/*    End implementation of Sobel7x7 filter   */
/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
new file mode 100644
index 0000000000..632b4a5374
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -0,0 +1,221 @@
/*
 * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined USE_F16 +#define MINVAL HALF_MIN +#define SELECT_DATA_TYPE short +#define DATA_TYPE half +#else +#define MINVAL FLT_MIN +#define SELECT_DATA_TYPE int +#define DATA_TYPE float +#endif + +__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL); +__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +/** Identifies the maximum value across the 1st dimension. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note In case F16 is used -DUSE_HALF must be passed otherwise the kernel will default to used F32. + * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: F16, F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: F16, F32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  width                             Input image width
 */
__kernel void softmax_layer_max(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint width)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Running per-lane maximum, seeded with type_min
    VEC_DATA_TYPE(DATA_TYPE, 16)
    max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;

    // Number of elements covered by complete 16-wide vectors
    const uint full_end = (width >> 4) << 4;

    // Walk the row 16 elements at a time
    for(uint x = 0; x < full_end; x += 16)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        chunk   = vload16(0, (__global DATA_TYPE *)offset(&src, x, 0));
        max_val = max(chunk, max_val);
    }

#if defined NON_MULTIPLE_OF_16
    // Tail: load one more vector and replace the lanes at or beyond width with type_min
    VEC_DATA_TYPE(DATA_TYPE, 16)
    tail = vload16(0, (__global DATA_TYPE *)offset(&src, full_end, 0));
    VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
    in_range = CONVERT(((uint16)(full_end) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
    max_val  = max(max_val, select(type_min, tail, in_range));
#endif

    // Fold 16 lanes -> 8 -> 4 -> 2 -> 1
    max_val.lo    = max(max_val.lo, max_val.hi);
    max_val.s0123 = max(max_val.s0123, max_val.s4567);
    max_val.s01   = max(max_val.s01, max_val.s23);
    max_val.s0    = max(max_val.s0, max_val.s1);

    // Lane 0 now holds the row maximum
    *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
}

/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note The element type is selected in this file: half when USE_F16 is defined, float otherwise.
 * @note In case F16 is used -DUSE_F16 must be passed otherwise the kernel will default to using F32.
 * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16, F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  max_ptr                           Pointer to the max values tensor slice.
Supported data types: F16, F32 + * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: F16, F32
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in]  width                             Input image width
 */
__kernel void softmax_layer_shift_exp_sum(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(max),
    IMAGE_DECLARATION(dst),
    IMAGE_DECLARATION(sum),
    uint width)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
    Image max = CONVERT_TO_IMAGE_STRUCT(max);
    Image sum = CONVERT_TO_IMAGE_STRUCT(sum);

    // Row maximum read from the max tensor
    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));

    // Per-lane partial sums of the exponentials
    VEC_DATA_TYPE(DATA_TYPE, 16)
    sum1D = 0;

    // Number of elements covered by complete 16-wide vectors
    const uint full_end = (width >> 4) << 4;

    // For every full vector: subtract the max, exponentiate, store and accumulate
    for(uint x = 0; x < full_end; x += 16)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        chunk = vload16(0, (__global DATA_TYPE *)offset(&src, x, 0));
        chunk = exp(chunk - max_val);
        vstore16(chunk, 0, (__global DATA_TYPE *)offset(&dst, x, 0));
        sum1D += chunk;
    }

#if defined NON_MULTIPLE_OF_16
    // Tail: same processing, with the lanes at or beyond width forced to 0 before storing and summing
    VEC_DATA_TYPE(DATA_TYPE, 16)
    tail = vload16(0, (__global DATA_TYPE *)offset(&src, full_end, 0));
    tail = exp(tail - max_val);
    VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
    in_range = CONVERT(((uint16)(full_end) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
    tail     = select(0, tail, in_range);
    vstore16(tail, 0, (__global DATA_TYPE *)offset(&dst, full_end, 0));
    sum1D += tail;
#endif

    // Sum reduction: fold 16 lanes -> 8 -> 4 -> 2 -> 1
    sum1D.s01234567 += sum1D.s89ABCDEF;
    sum1D.s0123 += sum1D.s4567;
    sum1D.s01 += sum1D.s23;
    sum1D.s0 += sum1D.s1;

    // Lane 0 now holds the row sum
    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
}

/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
 *
 * @note The element type is selected in this file: half when USE_F16 is defined, float otherwise.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16, F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: F16, F32
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice.
Supported data types: F16, F32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(sum),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Sum of the current row: the sum image is addressed explicitly by the Y global id
    __global DATA_TYPE *sum_addr = (__global DATA_TYPE *)offset(&sum, 0, get_global_id(1));
    const DATA_TYPE     row_sum  = *sum_addr;

    // Normalize 16 elements by the row sum
    __global DATA_TYPE *in_addr  = (__global DATA_TYPE *)offset(&src, 0, 0);
    __global DATA_TYPE *out_addr = (__global DATA_TYPE *)offset(&dst, 0, 0);

    VEC_DATA_TYPE(DATA_TYPE, 16)
    values = vload16(0, in_addr);
    vstore16(values / row_sum, 0, out_addr);
}
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
new file mode 100644
index 0000000000..cee116bd75
--- /dev/null
+++ b/src/core/CL/cl_kernels/tablelookup.cl
@@ -0,0 +1,114 @@
/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function performs table lookup on U8 input/output images. + * + * Global Workgroup Size [ DIV_CEIL(width, 8), height ] + * + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] lut LUT table. 
Supported data types: U8 + */ +__kernel void tablelookup_U8( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst), + __global uchar *lut) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Load input data */ + uchar8 data = vload8(0, src.ptr); + + /* Load lut data */ + uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3], + lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]); + + /* Store result */ + vstore8(lut_data, 0, dst.ptr); +} + +/** This function performs table lookup on S16 input/output images. + * + * Global Workgroup Size [ DIV_CEIL(width, 8), height ] + * + * @param[in] src_ptr Pointer to the source image. Supported data types: S16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16 + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] lut LUT table. 
Supported data types: S16 + * @param[in] offset LUT offset + * @param[in] count Number of elements in the LUT + */ +__kernel void tablelookup_S16( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst), + __global short *lut, + uint offset, + uint count) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Load input data */ + short8 data = vload8(0, (__global short *)src.ptr); + + /* Load output data */ + int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr)); + + /* Calculate index */ + int8 index = convert_int8(data) + (int8)(offset); + int8 cond = (index >= 0 && index < (int8)count); + index = select(0, index, cond); + + /* Load lut data */ + int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3], + lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]); + + /* Select output data depending on condition */ + lut_data = select(out_data, lut_data, cond); + + /* Store result */ + vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr); +} diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl new file mode 100644 index 0000000000..2b1e6ff35d --- /dev/null +++ b/src/core/CL/cl_kernels/threshold.cl @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Perform binary thresholding on an image. + * + * @param[in] in_ptr Pointer to the source image + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[out] out_ptr Pointer to the destination image + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] false_val False value + * @param[in] true_val True value + * @param[in] threshold The threshold value + */ +__kernel void threshold_binary( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const uchar false_val, + const uchar true_val, + const uchar threshold) +{ + // Get pixels pointer + Image in = CONVERT_TO_IMAGE_STRUCT(in); + Image out = 
CONVERT_TO_IMAGE_STRUCT(out); + + // Load data + uchar16 in_data = vload16(0, in.ptr); + + // Perform binary thresholding + in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold); + + // Store result + vstore16(in_data, 0, out.ptr); +} + +/** Perform range thresholding on an image. + * + * @param[in] in_ptr Pointer to the source image + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[out] out_ptr Pointer to the destination image + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] false_val False value + * @param[in] true_val True value + * @param[in] lower Lower threshold + * @param[in] upper Upper threshold + */ +__kernel void threshold_range( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const uchar false_val, + const uchar true_val, + const uchar lower, + const uchar upper) +{ + // Get pixels pointer + Image in = CONVERT_TO_IMAGE_STRUCT(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + // Load data + uchar16 in_data = vload16(0, in.ptr); + + // Perform range thresholding + in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < 
(uchar16)lower); + + // Store result + vstore16(in_data, 0, out.ptr); +} diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl new file mode 100644 index 0000000000..c30158f280 --- /dev/null +++ b/src/core/CL/cl_kernels/transpose.cl @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#define SWAP_ROW(u0, l0) \ + ({ \ + tmp_swap = u0; \ + u0 = l0; \ + l0 = tmp_swap; \ + }) + +#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + tmp_swap; \ + SWAP_ROW(u0, l0); \ + SWAP_ROW(u1, l1); \ + SWAP_ROW(u2, l2); \ + SWAP_ROW(u3, l3); \ + }) + +#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + tmp_swap; \ + SWAP_ROW(u0, l0); \ + SWAP_ROW(u1, l1); \ + SWAP_ROW(u2, l2); \ + SWAP_ROW(u3, l3); \ + SWAP_ROW(u4, l4); \ + SWAP_ROW(u5, l5); \ + SWAP_ROW(u6, l6); \ + SWAP_ROW(u7, l7); \ + }) + +#define TRANSPOSE_4x4(u0, u1, u2, u3) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + tmp; \ + tmp.s012 = u0.s123; \ + u0.s1 = u1.s0; \ + u0.s2 = u2.s0; \ + u0.s3 = u3.s0; \ + u1.s0 = tmp.s0; \ + u2.s0 = tmp.s1; \ + u3.s0 = tmp.s2; \ + \ + tmp.s01 = u1.s23; \ + u1.s2 = u2.s1; \ + u1.s3 = u3.s1; \ + u2.s1 = tmp.s0; \ + u3.s1 = tmp.s1; \ + \ + tmp.s0 = u2.s3; \ + u2.s3 = u3.s2; \ + u3.s2 = tmp.s0; \ + }) + +#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \ + ({ \ + TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \ + TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \ + TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \ + TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \ + SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \ + }) + +#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \ + ({ \ + TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \ + TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \ + TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \ + TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, 
u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \ + SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \ + u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \ + }) + +#ifndef DATA_TYPE_IN_BYTES +#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel +#endif + +#if DATA_TYPE_IN_BYTES == 4 +#define DATA_TYPE uint +#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3) +#define VLOAD(x, y) vload4(x, y) +#define VSTORE(x, y, z) vstore4(x, y, z) +#define BLOCK_SIZE 4 +#elif DATA_TYPE_IN_BYTES == 2 +#define DATA_TYPE ushort +#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) +#define VLOAD(x, y) vload8(x, y) +#define VSTORE(x, y, z) vstore8(x, y, z) +#define BLOCK_SIZE 8 +#elif DATA_TYPE_IN_BYTES == 1 +#define DATA_TYPE uchar +#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) +#define VLOAD(x, y) vload16(x, y) +#define VSTORE(x, y, z) vstore16(x, y, z) +#define BLOCK_SIZE 16 +#else +#error DATA_TYPE_IN_BYTES not supported for transpose +#endif + +/** This OpenCL kernel computes the matrix transposition of input matrix + * + * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be: + * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices + * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices + * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void transpose(IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + uint x = get_global_id(0) * BLOCK_SIZE; + uint y = get_global_id(1) * BLOCK_SIZE; + + // Compute source address + Image src = CONVERT_TO_IMAGE_STRUCT(src); + + // Load the NxN block at (x, y) + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3))); +#if BLOCK_SIZE > 4 + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 
5))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7))); +#if BLOCK_SIZE == 16 + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14))); + VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) + u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15))); +#endif /* BLOCK_SIZE == 16 */ +#endif /* BLOCK_SIZE > 4 */ + + // Transpose the block + TRANSPOSE(); + + // Store the block at (y, x) + uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes; + VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y)); + VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y)); + VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y)); + VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y)); +#if BLOCK_SIZE > 4 + VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y)); + VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y)); + VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y)); + VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + 
dst_offset_in_bytes + 7 * dst_stride_y)); +#if BLOCK_SIZE == 16 + VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y)); + VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y)); + VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y)); + VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y)); + VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y)); + VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y)); + VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y)); + VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y)); +#endif /* BLOCK_SIZE == 16 */ +#endif /* BLOCK_SIZE > 4 */ +} diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h new file mode 100644 index 0000000000..87736465d2 --- /dev/null +++ b/src/core/CL/cl_kernels/types.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TYPES_H +#define ARM_COMPUTE_TYPES_H + +/** 2D Coordinates structure */ +typedef struct Coordinates2D +{ + int x; /**< The x coordinate. */ + int y; /**< The y coordinate. */ +} Coordinates2D; + +/* Keypoint struct */ +typedef struct Keypoint +{ + int x; /**< The x coordinate. */ + int y; /**< The y coordinate. */ + float strength; /**< The strength of the keypoint. Its definition is specific to the corner detector. */ + float scale; /**< Initialized to 0 by corner detectors. */ + float orientation; /**< Initialized to 0 by corner detectors. */ + int tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */ + float error; /**< A tracking method specific error. Initialized to 0 by corner detectors. */ +} Keypoint; + +/** Detection window struct */ +typedef struct DetectionWindow +{ + ushort x; /**< Top-left x coordinate */ + ushort y; /**< Top-left y coordinate */ + ushort width; /**< Width of the detection window */ + ushort height; /**< Height of the detection window */ + ushort idx_class; /**< Index of the class */ + float score; /**< Confidence value for the detection window */ +} DetectionWindow; +#endif // ARM_COMPUTE_TYPES_H diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl new file mode 100644 index 0000000000..0a4748f452 --- /dev/null +++ b/src/core/CL/cl_kernels/warp_affine.cl @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +/** Returns a vector of floats containing the matrix coefficients. */ +inline const float8 build_affine_mtx() +{ + return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0); +} + +/** Transforms 4 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * + * @param[in] coord 2D coordinate to transform. + * @param[in] mtx affine matrix + * + * @return a float8 containing the 4 transformed 2D coordinates, interleaved as (x0, y0, x1, y1, x2, y2, x3, y3).
+ */ +inline const float8 apply_affine_transform(const float2 coord, const float8 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // transform [x,x+1,x+2,x+3] + const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4))); + // transform [y,y+1,y+2,y+3] + const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5))); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + +/** Performs an affine transform on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8. + * + * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * output(x,y) = input(x0,y0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. 
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_affine_nearest_neighbour( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr); +} + +/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. 
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_affine_bilinear( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr); +} diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h new file mode 100644 index 0000000000..26a8b859a4 --- /dev/null +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Clamps the given coordinates to the borders. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * + */ +inline const float8 clamp_to_border(float8 coords, const float width, const float height) +{ + const float4 clamped_x = clamp(coords.even, -1.0f, width); + const float4 clamped_y = clamp(coords.odd, -1.0f, height); + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); +} + +/** Reads four texels from the input image. The coords vector is used to determine which texels to be read. + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of coordinates to be read from the image. + */ +inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords) +{ + return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)), + *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)), + *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)), + *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7))); +} + +/** Returns the current thread coordinates. 
*/ +inline const float2 get_current_coords() +{ + return (float2)(get_global_id(0) * 4, get_global_id(1)); +} + +/** Given a texel coordinates this function will return the following array of coordinates: + * [ P, right neighbour, below neighbour, below right neighbour ] + * + * @note No checks to see if the coordinates are out of the image are done here. + * + * @param[in] coord Input coordinates + * + * @return vector of 8 floats with the coordinates, even positions are x and odd y. +*/ +inline const float8 get_neighbour_coords(const float2 coord) +{ + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); +} + +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image +*/ +inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +{ + // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
+ + // Sets the 4x4 coordinates for each of the four input texels + const float8 fc = floor(coords); + const float16 c1 = (float16)( + clamp_to_border(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height), + clamp_to_border(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height)); + const float16 c2 = (float16)( + clamp_to_border(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height), + clamp_to_border(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height)); + // Loads the values from the input image + const float16 t = (float16)( + /* tl, tr, bl, br */ + * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), + *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), + *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), + *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), + *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), + *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), + *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), + *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = (float4)( + ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4)); +} diff --git 
a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl new file mode 100644 index 0000000000..863b6c9e96 --- /dev/null +++ b/src/core/CL/cl_kernels/warp_perspective.cl @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +/** Returns the perspective matrix */ +inline const float16 build_perspective_mtx() +{ + return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0); +} + +/** Transforms four 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + * + * (x0/z0,y0/z0) + * + * @param[in] coord 2D coordinate to transform. 
+ * @param[in] mtx perspective matrix + * + * @return a vector float8 containing four 2D transformed values. + */ +inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // z0 for the four horizontally adjacent coordinates x, x+1, x+2, x+3 (all on the same row y) + const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8))); + // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation + // x0 / z0 for the same four coordinates + const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z; + // y0 / z0 for the same four coordinates (only x is incremented, y stays fixed) + const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z; + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + +/** Performs perspective transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8. + * + * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + + * output(x,y) = input(x0/z0,y0/z0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_perspective_nearest_neighbour( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr); +} + +/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. 
Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_perspective_bilinear( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr); +} diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp new file mode 100644 index 0000000000..685b8e234e --- /dev/null +++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "The output image can only be U8 if both input images are U8"); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), 
Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp new file mode 100644 index 0000000000..6333f04e71 --- /dev/null +++ b/src/core/CL/kernels/CLAccumulateKernel.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate")); + + // Make sure _kernel is initialized before calling the parent's configure + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted")); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, alpha); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + 
ARM_COMPUTE_ERROR_ON(shift > 15); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_squared")); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, shift); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp new file mode 100644 index 0000000000..83bbe6a3be --- /dev/null +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + // Set build options + std::set<std::string> build_opts; + build_opts.insert(("-D" + string_from_activation_func(act_info.activation()))); + build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? 
std::string("TYPE_FP") : std::string("TYPE_INT")))); + build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.insert(("-DA=" + val_to_string(act_info.a()))); + build_opts.insert(("-DB=" + val_to_string(act_info.b()))); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts)); + + // Make sure _kernel is initialized before calling the parent's configure + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp new file mode 100644 index 0000000000..aaa62d0268 --- /dev/null +++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> +#include <set> +#include <string> + +using namespace arm_compute; + +CLArithmeticAdditionKernel::CLArithmeticAdditionKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + + _input1 = input1; + _input2 = input2; + _output = output; + + const bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Check for invalid combination + if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8)) + { + ARM_COMPUTE_ERROR("You called with the wrong data types."); + } + + // Set kernel build options + std::set<std::string> build_opts; + 
build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp new file mode 100644 index 0000000000..4c847276da --- /dev/null 
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + // Check for invalid combination + if(output->info()->data_type() == DataType::U8) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + } + else + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + } + + _input1 = input1; + _input2 = input2; + _output = output; + + bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? 
"-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..309a153b7a --- /dev/null +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp 
@@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() + : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0) +{ +} + +void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0)); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + _input = input; + _output = output; + _mean = mean; + _var = var; + _beta = beta; + _gamma = gamma; + _epsilon = epsilon; + + // Create kernel + std::string kernel_name = "batchnormalization_layer"; + _kernel = 
static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set kernel static arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + _kernel.setArg<cl_float>(idx++, _epsilon); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + Window vector_slice = window.first_slice_window_1D(); + vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + add_1D_tensor_argument(idx, _mean, vector_slice); + add_1D_tensor_argument(idx, _var, vector_slice); + add_1D_tensor_argument(idx, _beta, vector_slice); + add_1D_tensor_argument(idx, _gamma, vector_slice); + + do + { + idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp new file mode 100644 index 0000000000..5ea4a86da5 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBitwiseAndKernel::CLBitwiseAndKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} +void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = 
window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp new file mode 100644 index 0000000000..0098e15ab6 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp new file mode 100644 index 0000000000..2eeef0a993 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBitwiseOrKernel::CLBitwiseOrKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + 
update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp new file mode 100644 index 0000000000..c19a78e1c4 --- /dev/null +++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLBitwiseXorKernel::CLBitwiseXorKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, 
valid_region); + + ICLKernel::configure(win); +} + +void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp new file mode 100644 index 0000000000..e113d30210 --- /dev/null +++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLBox3x3Kernel::border_size() const +{ + return 1; +} + +void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Set build options + std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1", + "-DMAT3=1", "-DMAT4=1", "-DMAT5=1", + "-DMAT6=1", "-DMAT7=1", "-DMAT8=1", + "-DSCALE=9", "-DDATA_TYPE_OUT=uchar" + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git 
a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp new file mode 100644 index 0000000000..5d06d34631 --- /dev/null +++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +CLGradientKernel::CLGradientKernel() + : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) +{ +} + +void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()), + "Gx and Gy must have the same pixel size"); + ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()), + "Mag must have the same pixel size as Gx and Gy"); + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + // Create build opts + std::set<std::string> built_opts; + built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type())); + built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type())); + + // Create kernel + const std::string kernel_name = (norm_type == 1) ? 
std::string("combine_gradients_L1") : std::string("combine_gradients_L2"); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, built_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access); + + mag_access.set_valid_region(win, _gx->info()->valid_region()); + phase_access.set_valid_region(win, _gx->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _gx, slice); + add_2D_tensor_argument(idx, _gy, slice); + add_2D_tensor_argument(idx, _magnitude, slice); + add_2D_tensor_argument(idx, _phase, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel() + : _magnitude(nullptr), _phase(nullptr), _output(nullptr) +{ +} + +BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const +{ + return BorderSize(1); +} + +void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, 
DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32); + + _magnitude = magnitude; + _phase = phase; + _output = output; + + // Create build opts + std::set<std::string> built_opts; + built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type())); + built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", built_opts)); + + // Set minimum threshold argument + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, lower_thr); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_written_per_iteration = 3; + + Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, + num_elems_read_written_per_iteration, num_elems_read_written_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, mag_access, phase_access, output_access); + + output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + 
add_2D_tensor_argument(idx, _magnitude, slice); + add_2D_tensor_argument(idx, _phase, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} +

/** Default constructor.
 *
 * Leaves every tensor pointer null and both thresholds at zero.
 */
CLEdgeTraceKernel::CLEdgeTraceKernel()
    : _input(nullptr),
      _output(nullptr),
      _lower_thr(0),
      _upper_thr(0),
      _visited(nullptr),
      _recorded(nullptr),
      _l1_stack(nullptr),
      _l1_stack_counter(nullptr)
{
}

/** Store the tensors and thresholds, build the "hysteresis" OpenCL kernel and configure its window.
 *
 * @param input            Single-channel tensor of type U16 or U32.
 * @param output           Single-channel tensor of type U8.
 * @param upper_thr        Upper threshold forwarded to the kernel as a scalar argument.
 * @param lower_thr        Lower threshold forwarded to the kernel as a scalar argument.
 * @param visited          Single-channel U32 tensor bound as a kernel argument.
 * @param recorded         Single-channel U32 tensor bound as a kernel argument.
 * @param l1_stack         Single-channel S32 tensor bound as a kernel argument.
 * @param l1_stack_counter Single-channel U8 tensor bound as a kernel argument.
 */
void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);

    _input            = input;
    _output           = output;
    _lower_thr        = lower_thr;
    _upper_thr        = upper_thr;
    _visited          = visited;
    _recorded         = recorded;
    _l1_stack         = l1_stack;
    _l1_stack_counter = l1_stack_counter;

    // Compile-time options: OpenCL element types of the input and output tensors
    std::set<std::string> build_options;
    build_options.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
    build_options.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));

    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", build_options));

    // The four scalar arguments sit after the six 2D tensors that run() binds for every slice
    const unsigned int img_width       = _input->info()->dimension(0);
    const unsigned int img_height      = _input->info()->dimension(1);
    const unsigned int scalar_arg_base = 6 * num_arguments_per_2D_tensor();
    _kernel.setArg(scalar_arg_base + 0, static_cast<cl_uint>(_lower_thr));
    _kernel.setArg(scalar_arg_base + 1, static_cast<cl_uint>(_upper_thr));
    _kernel.setArg(scalar_arg_base + 2, static_cast<cl_uint>(img_width));
    _kernel.setArg(scalar_arg_base + 3, static_cast<cl_uint>(img_height));

    // Execution window: one element per work-item, sized from the input tensor
    constexpr unsigned int elems_per_iteration = 1;
    Window                 win                 = calculate_max_window(*_input->info(), Steps(elems_per_iteration));

    AccessWindowHorizontal input_access(_input->info(), 0, elems_per_iteration);
    AccessWindowHorizontal output_access(_output->info(), 0, elems_per_iteration);
    AccessWindowHorizontal visited_access(_visited->info(), 0, elems_per_iteration);
    AccessWindowHorizontal recorded_access(_recorded->info(), 0, elems_per_iteration);
    AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, elems_per_iteration);
    AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, elems_per_iteration);

    update_window_and_padding(win,
                              input_access,
                              output_access,
                              visited_access,
                              recorded_access,
                              l1_stack_access,
                              l1_stack_counter_access);

    // Every non-input tensor takes its valid region from the input
    AccessWindowHorizontal *const derived_accesses[] = { &output_access, &visited_access, &recorded_access, &l1_stack_access, &l1_stack_counter_access };
    for(AccessWindowHorizontal *access : derived_accesses)
    {
        access->set_valid_region(win, _input->info()->valid_region());
    }

    ICLKernel::configure(win);
}

/** Enqueue the kernel once per 2D slice of @p window.
 *
 * @param window Region to execute; checked against the configured kernel window.
 * @param queue  Command queue the kernel is enqueued on.
 */
void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Binding order defines the kernel argument order
    const ICLTensor *const kernel_tensors[] = { _input, _output, _visited, _recorded, _l1_stack, _l1_stack_counter };

    Window current_slice = window.first_slice_window_2D();
    do
    {
        unsigned int arg_idx = 0;
        for(const ICLTensor *tensor : kernel_tensors)
        {
            add_2D_tensor_argument(arg_idx, tensor, current_slice);
        }

        enqueue(queue, *this, current_slice);
    }
    while(window.slide_window_slice_2D(current_slice));
}
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp new file mode 100644 index 0000000000..d729ebcfb3 --- /dev/null +++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLChannelCombineKernel::CLChannelCombineKernel() + : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } } +{ +} + +void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); + + const Format fmt = output->info()->format(); + _planes[0] = plane0; + _planes[1] = plane1; + _planes[2] = plane2; + if(Format::RGBA8888 == fmt) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8); + _planes[3] = plane3; + } + else + { + _planes[3] = nullptr; + } + _output = output; + _output_multi = nullptr; + + // Half the processed elements for U,V channels due to sub-sampling of 2 + if(Format::YUYV422 == fmt || Format::UYVY422 == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + } + else + { + _x_subsampling = { { 1, 1, 1 } }; + _y_subsampling = { { 1, 1, 1 } }; + } + + // Create kernel + std::string kernel_name = "channel_combine_" + string_from_format(fmt); + 
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(), + plane1->info()->valid_region(), + plane2->info()->valid_region()); + if(plane3 != nullptr) + { + valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region); + } + output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + + _planes[0] = plane0; + 
_planes[1] = plane1; + _planes[2] = plane2; + _planes[3] = nullptr; + _output = nullptr; + _output_multi = output; + bool has_two_planars = false; + + // Set sub-sampling parameters for each plane + const Format fmt = output->info()->format(); + std::string kernel_name; + std::set<std::string> build_opts; + + if(Format::NV12 == fmt || Format::NV21 == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + kernel_name = "channel_combine_NV"; + build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21"); + has_two_planars = true; + } + else + { + if(Format::IYUV == fmt) + { + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + } + else + { + _x_subsampling = { { 1, 1, 1 } }; + _y_subsampling = { { 1, 1, 1 } }; + } + + kernel_name = "copy_planes_3p"; + build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle output_plane2_access(has_two_planars ? 
nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + + update_window_and_padding(win, + input_plane0_access, input_plane1_access, input_plane2_access, + output_plane0_access, output_plane1_access, output_plane2_access); + + ValidRegion plane0_valid_region = plane0->info()->valid_region(); + ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region(); + output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + // Subsampling in plane 1 + Window win_sub_plane1(slice); + win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1])); + win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1)); + + // Subsampling in plane 2 + Window win_sub_plane2(slice); + win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2])); + win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / 
_y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1)); + + unsigned int idx = 0; + + // Set inputs + add_2D_tensor_argument(idx, _planes[0], slice); + add_2D_tensor_argument(idx, _planes[1], win_sub_plane1); + add_2D_tensor_argument(idx, _planes[2], win_sub_plane2); + + if(nullptr != _planes[3]) + { + add_2D_tensor_argument(idx, _planes[3], slice); + } + + // Set outputs + if(nullptr != _output) // Single planar output + { + add_2D_tensor_argument(idx, _output, slice); + } + else // Multi-planar output + { + // Reduce slice in case of subsampling to avoid out-of bounds access + slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1)); + + add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice); + add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1); + + if(3 == num_planes_from_format(_output_multi->info()->format())) + { + add_2D_tensor_argument(idx, _output_multi->cl_plane(2), win_sub_plane2); + } + + _kernel.setArg(idx++, slice.y().end()); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp new file mode 100644 index 0000000000..541153316a --- /dev/null +++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLChannelExtractKernel::CLChannelExtractKernel() + : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1) +{ +} + +void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output)); + + _input = input; + _output = output; + + // Check format + const Format format = input->info()->format(); + ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel); + + // Create kernel + std::string kernel_name = "channel_extract_" + string_from_format(format); + std::set<std::string> build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Half the processed elements for U,V channels due to sub-sampling of 2 + _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 
2 : 1; + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration); + AccessWindowRectangle output_access(input->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling); + + update_window_and_padding(win, input_access, output_access); + + ValidRegion input_valid_region = input->info()->valid_region(); + output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output)); + + // Get format + const Format fmt = input->info()->format(); + + // Get input plane + const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel)); + ARM_COMPUTE_ERROR_ON(nullptr == input_plane); + + _output = output; + _input = input_plane; + _subsampling = 1; + + // Create kernel + std::string kernel_name; + std::set<std::string> build_opts; + if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt) + { + kernel_name = "copy_plane"; + } + else + { + kernel_name = "channel_extract_" + string_from_format(fmt); + build_opts.insert(("-DCHANNEL_" + string_from_channel(channel))); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(input_plane->info(), 0, 
_num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input_plane->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + Window win_sub(slice); + win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling)); + win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1)); + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, win_sub); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp new file mode 100644 index 0000000000..ad66c39483 --- /dev/null +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <cmath> + +using namespace arm_compute; + +CLCol2ImKernel::CLCol2ImKernel() + : _input(nullptr), _output(nullptr), _convolved_dims() +{ +} + +void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + _convolved_dims = convolved_dims; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + _kernel.setArg<cl_uint>(idx++, _convolved_dims.first); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps()); + // The CLCol2ImKernel doesn't need 
padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + ICLKernel::configure(win); +} + +void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_2D(); + Window slice_out = window.first_slice_window_3D(); + do + { + // Set inputs + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } + while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out)); +} diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp new file mode 100644 index 0000000000..ead2b8f092 --- /dev/null +++ b/src/core/CL/kernels/CLColorConvertKernel.cpp @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLMultiImage.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <sstream> + +using namespace arm_compute; + +CLColorConvertKernel::CLColorConvertKernel() + : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr) +{ +} + +void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + switch(input->info()->format()) + { + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::RGB888: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + case Format::UYVY422: + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + num_elems_processed_per_iteration = 8; + break; + default: + break; + } + break; + } + case Format::RGB888: + { + switch(output->info()->format()) + { + case Format::RGBA8888: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to 
%s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->format()) + { + case Format::NV12: + case Format::NV21: + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + num_elems_processed_per_iteration = 4; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _multi_input = input; + _output = 
output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21); + const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling, sub_sampling); + AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling, sub_sampling); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + plane0_access, plane1_access, plane2_access, + output_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), + input->plane(2)->info()->valid_region()); + output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + unsigned int num_elems_processed_per_iteration = 0; + + bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); + float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 
0.5f : 1; + + switch(input->info()->format()) + { + case Format::RGB888: + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::NV12: + case Format::IYUV: + num_elems_processed_per_iteration = 2; + break; + case Format::YUV444: + num_elems_processed_per_iteration = 4; + break; + default: + break; + } + break; + } + case Format::UYVY422: + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::NV12: + case Format::IYUV: + num_elems_processed_per_iteration = 8; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _input = input; + _multi_output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444)) + { + win.set_dimension_step(Window::DimY, 2); + } + + AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + AccessWindowRectangle output_plane2_access(has_two_planes ? 
nullptr : output->plane(2)->info(), 0, 0, + num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_plane0_access, + output_plane1_access, + output_plane2_access); + + ValidRegion input_region = input->info()->valid_region(); + + output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output) +{ + unsigned int num_elems_processed_per_iteration = 0; + switch(input->info()->format()) + { + case Format::NV12: + case Format::NV21: + { + switch(output->info()->format()) + { + case Format::IYUV: + case Format::YUV444: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::YUV444: + case Format::NV12: + num_elems_processed_per_iteration = 16; + break; + default: + break; + } + break; + } + default: + break; + } + ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", + string_from_format(input->info()->format()).c_str(), + string_from_format(output->info()->format()).c_str()); + + std::stringstream kernel_name; + + kernel_name << string_from_format(input->info()->format()); + kernel_name << "_to_"; + kernel_name << string_from_format(output->info()->format()); + kernel_name << "_bt709"; + + _multi_input = input; + _multi_output = output; + + // Create kernel + bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == 
Format::NV21); + bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); + + float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; + float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1; + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str())); + + Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling_input, sub_sampling_input); + AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, + sub_sampling_input, sub_sampling_input); + AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); + AccessWindowRectangle output_plane2_access(has_two_output_planars ? 
nullptr : output->plane(2)->info(), 0, 0, + num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); + + update_window_and_padding(win, + input_plane0_access, input_plane1_access, input_plane2_access, + output_plane0_access, output_plane1_access, output_plane2_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), + input->plane(2)->info()->valid_region()); + output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + if(nullptr != _input && nullptr != _output) + { + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _input && nullptr != _multi_output) + { + Format format = _multi_output->info()->format(); + do + { + Window win_uv(slice); + + if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) + { + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + } + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _multi_output->cl_plane(0), 
slice); + for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv); + } + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _multi_input && nullptr != _output) + { + Format format = _multi_input->info()->format(); + do + { + Window win_uv(slice); + + if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) + { + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); + + for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv); + } + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else if(nullptr != _multi_input && nullptr != _multi_output) + { + Format in_format = _multi_input->info()->format(); + Format out_format = _multi_output->info()->format(); + do + { + Window win_in_uv(slice); + if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format)) + { + win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2, + win_in_uv.x().end() / 2, win_in_uv.x().step() / 2)); + win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1)); + } + unsigned int idx = 0; + add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); + for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv); + } + + Window win_out_uv(slice); + if((Format::NV12 == out_format) || (Format::NV21 == 
out_format) || (Format::IYUV == out_format)) + { + win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2, + win_out_uv.x().end() / 2, win_out_uv.x().step() / 2)); + win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1)); + } + + add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice); + for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) + { + add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv); + } + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + } + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp new file mode 100644 index 0000000000..bdfe398a1d --- /dev/null +++ b/src/core/CL/kernels/CLConvolutionKernel.cpp @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +#define MAX_MATRIX_SIZE 81 + +/****************************************************************************************\ + * Square Convolution * +\****************************************************************************************/ + +template <unsigned int matrix_size> +BorderSize CLConvolutionKernel<matrix_size>::border_size() const +{ + return BorderSize(matrix_size / 2); +} + +template <unsigned int matrix_size> +void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(conv == nullptr); + + _input = input; + _output = output; + + std::stringstream kernel_name; + std::set<std::string> options; + kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static"; + + if(scale == 0) + { + scale = calculate_matrix_scale(conv, matrix_size); + } + + for(unsigned int i = 0; i < matrix_size * 
matrix_size; i++) + { + std::stringstream mat_str; + mat_str << "-DMAT" << i << "=" << conv[i]; + options.insert(mat_str.str()); + } + + options.insert("-DSCALE=" + val_to_string(scale)); + + DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size); + options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + + std::stringstream out_type; + out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + options.insert(out_type.str()); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = matrix_size; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/****************************************************************************************\ + * Separable Convolution * +\****************************************************************************************/ +template <unsigned int matrix_size> +CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel() + : _border_size(0) +{ +} + +template <unsigned int matrix_size> +BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const +{ + return _border_size; +} + +template 
<unsigned int matrix_size> +void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32); + + ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2); + + // Set build options + std::set<std::string> build_opts; + + int16_t mat[matrix_size * matrix_size] = { 0 }; + memcpy(mat, conv, matrix_size * sizeof(int16_t)); + + for(unsigned int j = 0; j < matrix_size * matrix_size; j++) + { + build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j])); + } + + build_opts.insert("-DSCALE=0"); + + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +template <unsigned int matrix_size> 
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const +{ + return BorderSize(matrix_size / 2, 0); +} + +template <unsigned int matrix_size> +void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, + const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); + ARM_COMPUTE_ERROR_ON(scale == 0); + + _input = input; + _output = output; + + std::set<std::string> build_opts; + + int16_t mat[matrix_size * matrix_size] = { 0 }; + memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t)); + + for(unsigned int j = 0; j < matrix_size * matrix_size; j++) + { + build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j])); + } + + build_opts.insert("-DSCALE=" + val_to_string(scale)); + + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + + build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type)); + + std::stringstream out_type; + out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + build_opts.insert(out_type.str()); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = matrix_size; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, 
border_size()); + + AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +/****************************************************************************************\ + * Rectangle Convolution * +\****************************************************************************************/ + +CLConvolutionRectangleKernel::CLConvolutionRectangleKernel() + : _border_size(0), _input(nullptr), _output(nullptr) +{ +} + +BorderSize CLConvolutionRectangleKernel::border_size() const +{ + return _border_size; +} + +void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(nullptr == conv); + ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width); + ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height); + ARM_COMPUTE_ERROR_ON(0 == scale); + + _input = input; + _output = output; + _border_size = BorderSize(height / 2, width / 2); + + std::set<std::string> options; + + std::stringstream output_type; + output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); + options.insert(output_type.str()); + + uint32_t matrix_size = width * height; + + int16_t mat[MAX_MATRIX_SIZE] = { 0 }; + + memcpy(mat, conv, matrix_size * sizeof(int16_t)); + + for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++) + { + options.insert("-DMAT" + 
val_to_string(j) + "=" + val_to_string(mat[j])); + } + + options.insert("-DSCALE=" + val_to_string(scale)); + + DataType data_type = data_type_for_convolution_matrix(conv, matrix_size); + options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + + options.insert("-DMATRIX_WIDTH=" + val_to_string(width)); + options.insert("-DMATRIX_HEIGHT=" + val_to_string(height)); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + const unsigned int num_rows_read_per_iteration = height; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +template class arm_compute::CLConvolutionKernel<3>; +template class arm_compute::CLConvolutionKernel<5>; +template class arm_compute::CLConvolutionKernel<7>; +template 
class arm_compute::CLConvolutionKernel<9>; +template class arm_compute::CLSeparableConvolutionVertKernel<5>; +template class arm_compute::CLSeparableConvolutionVertKernel<7>; +template class arm_compute::CLSeparableConvolutionVertKernel<9>; +template class arm_compute::CLSeparableConvolutionHorKernel<5>; +template class arm_compute::CLSeparableConvolutionHorKernel<7>; +template class arm_compute::CLSeparableConvolutionHorKernel<9>; diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp new file mode 100644 index 0000000000..73f1ba15df --- /dev/null +++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLDepthConcatenateKernel::CLDepthConcatenateKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) +{ +} + +BorderSize CLDepthConcatenateKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth")); + + // Configure kernel window + _left_right = (output->info()->dimension(0) - 
input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom * + output->info()->strides_in_bytes()[1]; + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = 4; + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size()); + + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters + _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes); + + ICLKernel::configure(win); +} + +void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp new file mode 100644 index 0000000000..24608bd17c --- /dev/null +++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp @@ -0,0 
+1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include <cstddef> +#include <set> +#include <string> + +using namespace arm_compute; + +void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different"); + ARM_COMPUTE_ERROR_ON(shift >= 8); + + // Check if convertion is supported + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16 + && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U8 -> [out] U16, S16, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types supported [in] U16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32 + && output->info()->data_type() != DataType::S32), + "Only data types 
supported [in] S16 -> [out] U8, U32, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] U32 -> [out] U8, U16, S16"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S16), + "Only data types supported [in] S32 -> [out] U8, U16, S16"); + + // Get data sizes + const size_t input_size = data_size_from_type(input->info()->data_type()); + const size_t output_size = data_size_from_type(output->info()->data_type()); + + // Construct kernel name and build options + std::string kernel_name = "convert_depth"; + std::set<std::string> build_opts; + if(input_size > output_size) + { + kernel_name += "_down"; + build_opts.insert((policy == ConvertPolicy::WRAP) ? 
"-DWRAP" : "-DSATURATE"); + } + else + { + kernel_name += "_up"; + } + build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set shift arg + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, shift); + + // Configure kernel + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp new file mode 100644 index 0000000000..36ba06d528 --- /dev/null +++ b/src/core/CL/kernels/CLDerivativeKernel.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLDerivativeKernel::CLDerivativeKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false) +{ +} + +BorderSize CLDerivativeKernel::border_size() const +{ + return BorderSize(1); +} + +void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_derivative_x = output_x != nullptr; + _run_derivative_y = output_y != nullptr; + + if(_run_derivative_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_derivative_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_derivative_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_derivative_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts)); + + // Configure kernel window + constexpr 
unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_read_rows_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration); + if(_run_derivative_x && _run_derivative_y) + { + input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration); + } + else if(_run_derivative_x) + { + input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration); + } + else if(_run_derivative_y) + { + input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration); + } + + update_window_and_padding(win, + input_access, + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_derivative_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_derivative_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + 
while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp new file mode 100644 index 0000000000..3abd747011 --- /dev/null +++ b/src/core/CL/kernels/CLDilateKernel.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDilateKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +BorderSize CLDilateKernel::border_size() const +{ + return BorderSize(1); +} + +void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate")); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp new file mode 100644 index 0000000000..a7aa88fc5c --- /dev/null +++ b/src/core/CL/kernels/CLErodeKernel.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLErodeKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +BorderSize CLErodeKernel::border_size() const +{ + return BorderSize(1); +} + +void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode")); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_pes_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_pes_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp new file mode 100644 index 0000000000..1d4d776730 --- /dev/null +++ b/src/core/CL/kernels/CLFastCornersKernel.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLFastCornersKernel::CLFastCornersKernel() + : ICLKernel(), _input(nullptr), _output(nullptr) +{ +} + +BorderSize CLFastCornersKernel::border_size() const +{ + return BorderSize(3); +} + +void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented"); + + _input = input; + _output = output; + + // Create build options + std::set<std::string> build_opts; + + if(non_max_suppression) + { + build_opts.emplace("-DUSE_MAXSUPPRESSION"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts)); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters + _kernel.setArg<cl_float>(idx, static_cast<float>(threshold)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 7; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3)); 
+ + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size()); + + ICLKernel::configure(win); +} + +void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLCopyToArrayKernel::CLCopyToArrayKernel() + : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr) +{ +} + +void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(corners == nullptr); + ARM_COMPUTE_ERROR_ON(num_buffers == nullptr); + + _input = input; + _corners = corners; + _num_buffer = num_buffers; + + std::set<std::string> build_opts; + + if(update_number) + { + build_opts.emplace("-DUPDATE_NUMBER"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts)); + + //Get how many pixels skipped in the x dimension in the previous stages + unsigned int offset = _input->info()->valid_region().anchor.x(); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters + 
_kernel.setArg<unsigned int>(idx++, corners->max_num_values()); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg(idx++, *_num_buffer); + _kernel.setArg(idx++, _corners->cl_buffer()); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + //Initialise the _num_buffer as it used as both input and output + static const unsigned int zero_init = 0; + queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp new file mode 100644 index 0000000000..981aad665a --- /dev/null +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensor is bound until configure() runs. */
+CLFillBorderKernel::CLFillBorderKernel()
+    : ICLKernel(), _tensor(nullptr)
+{
+}
+
+/** This kernel always reports itself as not parallelisable.
+ *
+ * @return false.
+ */
+bool CLFillBorderKernel::is_parallelisable() const
+{
+    return false;
+}
+
+/** Read the constant border value as type T and append it as the next kernel argument.
+ *
+ * @param[in] idx                   Index of the kernel argument to set. Taken by value, so the
+ *                                  caller's index is not advanced.
+ * @param[in] constant_border_value Pixel value to extract the T-typed constant from.
+ */
+template <class T>
+void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+    T value;
+    constant_border_value.get(value);
+    ICLKernel::add_argument<T>(idx, static_cast<T>(value));
+}
+
+/** Build the border-filling OpenCL kernel for @p tensor and set up its execution window.
+ *
+ * Returns without creating a kernel when the (limited) border is empty or the border mode is
+ * UNDEFINED; run() detects that case through the null kernel handle.
+ *
+ * @param[in,out] tensor                Tensor whose border is filled. Must not be nullptr and must have one channel.
+ * @param[in]     border_size           Requested border; limited to the tensor's padding before use.
+ * @param[in]     border_mode           Border mode; selects the kernel name suffix.
+ * @param[in]     constant_border_value Value passed to the kernel when @p border_mode is CONSTANT.
+ */
+void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+    border_size.limit(tensor->info()->padding());
+
+    // If there is no border: early exit
+    // NOTE(review): the early exit does not reset _kernel/_tensor, so a kernel created by a
+    // previous configure() call presumably stays active - confirm reconfiguration is not expected.
+    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    {
+        return;
+    }
+
+    // Select appropriate kernel
+    std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Define select type required by replicate border > 1
+    const DataType dt          = tensor->info()->data_type();
+    std::string    select_type = get_cl_type_from_data_type(dt);
+    if(is_data_type_float(dt))
+    {
+        // Floating point data uses an integer select type of matching width
+        select_type = (DataType::F32 == dt) ? "int" : "short";
+    }
+
+    // Define build options
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.emplace(("-DSELECT_TYPE=" + select_type));
+    build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
+    build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
+    build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
+    build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _tensor = tensor;
+
+    // Create static kernel arguments
+    const unsigned int valid_width         = tensor->info()->valid_region().shape[0];
+    const unsigned int valid_height        = tensor->info()->valid_region().shape[1];
+    const cl_int2      valid_region_coords =
+    {
+        {
+            static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
+            static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+        }
+    };
+    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    ICLKernel::add_argument<cl_uint>(idx, valid_width);
+    ICLKernel::add_argument<cl_uint>(idx, valid_height);
+    ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
+    if(BorderMode::CONSTANT == border_mode)
+    {
+        // The constant is passed with the C++ type matching the tensor's data type
+        switch(dt)
+        {
+            case DataType::U8:
+                set_constant_border<uint8_t>(idx, constant_border_value);
+                break;
+            case DataType::U16:
+                set_constant_border<uint16_t>(idx, constant_border_value);
+                break;
+            case DataType::S16:
+                set_constant_border<int16_t>(idx, constant_border_value);
+                break;
+            case DataType::U32:
+                set_constant_border<uint32_t>(idx, constant_border_value);
+                break;
+            case DataType::S32:
+                set_constant_border<int32_t>(idx, constant_border_value);
+                break;
+            case DataType::F32:
+                static_assert(sizeof(float) == 4, "Float must be 32 bit");
+                set_constant_border<float>(idx, constant_border_value);
+                break;
+            case DataType::F16:
+                static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
+                set_constant_border<cl_half>(idx, constant_border_value);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not handled");
+        }
+    }
+
+    // Configure kernel window
+    // X spans (total_valid_width + valid_height) positions, Y is a single row, Z follows the tensor
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the border-fill kernel once per 2D slice of @p window; a no-op if configure() created no kernel.
+ *
+ * @param[in]     window Region to execute; must match the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    // Border mode undefined or border width == 0
+    if(_kernel() == nullptr)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _tensor, slice);
+        // cl::NullRange is passed as the last enqueue argument
+        enqueue(queue, *this, slice, cl::NullRange);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..71d42c5606 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f)); + + _input = input; + _output = output; + + // Create kernel + std::string data_type_name; + data_type_name = val_to_string(input->info()->element_size() * 8) + "bit"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + + Window win = 
calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ] + */ + Window in_slice = window.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + // Change x and y steps for the slide of output tensor + out_slice.scale(Window::DimX, 4.f); + out_slice.scale(Window::DimY, 0.25f); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); +} diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..c6e05b92a2 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> +#include <cstdint> +#include <tuple> + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, + int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + + _input0 = input0; + _input1 = input1; + _output = output; + + // Create kernel and set static arguments + std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts)); + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<int32_t>(idx++, a_offset); + _kernel.setArg<int32_t>(idx++, b_offset); + _kernel.setArg<int32_t>(idx++, output_offset); + _kernel.setArg<int32_t>(idx++, output_mult_int); + _kernel.setArg<int32_t>(idx++, shift); + + // Configure window + constexpr unsigned int 
num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + constexpr unsigned int num_elems_read_per_iteration_input0 = 4; + constexpr unsigned int num_elems_read_per_iteration_input1 = 16; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + Window slice_matrix_b = slice; + slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1)); + slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() < 3) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_2D_tensor_argument(idx, _input1, slice_b); + add_2D_tensor_argument(idx, _output, slice); + 
enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 0000000000..289873c23f --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); + + _biases = biases; + _accum = accum; + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type()); + + Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1)); + AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, biases_access, accum_access); + + ICLKernel::configure(win); +} + +void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) 
+{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window accum_slice = window.first_slice_window_2D(); + + Window biases_slice(accum_slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_2D_tensor_argument(idx, _accum, accum_slice); + add_1D_tensor_argument(idx, _biases, biases_slice); + + enqueue(queue, *this, accum_slice); + } + while(window.slide_window_slice_2D(accum_slice)); +} diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..343838f2f9 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + _input = input; + _output = output; + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + + std::ostringstream ma_arguments; + ma_arguments << "-DBETA=" << beta; + std::set<std::string> build_opts; + build_opts.emplace(ma_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + 
update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..d7388e8579 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/AccessWindowTranspose.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + if(output->info()->dimension(1) == 1) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + _input0 = input0; + _input1 = input1; + _output = output; + + if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } + + std::ostringstream mm_arguments; + mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " "; + 
mm_arguments << "-DALPHA=" << alpha << " "; + std::set<std::string> build_opts; + + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if(output->info()->dimension(1) == 1) + { + mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " "; + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts)); + + // Configure window kernel + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } + else + { + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + + if(data_type_name == "f32") + { + GPUTarget arch_target = get_arch_from_target(get_target()); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts)); + } + else + { + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts)); + } + + // Configure window kernel + const unsigned int num_elems_processed_per_iteration_x 
= max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } +} + +void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + Window slice_matrix_b = slice; + slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1)); + slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() < 3) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_2D_tensor_argument(idx, _input1, slice_b); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); + } + 
while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp new file mode 100644 index 0000000000..ecee1abd72 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> + +using namespace arm_compute; + +void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t transpose_w = 16 / input->info()->element_size(); + output_shape.set(0, input->info()->dimension(1) * transpose_w); + output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w))))); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type()); + + /* + * Following an example of how the transposition1xW works when the input data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ] + * 
If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ] + */ + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type())); + std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + float scale_x = 1.f; + + switch(input->info()->data_type()) + { + case DataType::U8: + scale_x = 16.f; + break; + case DataType::F16: + scale_x = 8.f; + break; + case DataType::F32: + scale_x = 4.f; + break; + default: + // Do nothing + break; + } + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Output is transposed + Window out_window(window); + out_window.set(Window::DimX, window.y()); + out_window.set(Window::DimY, window.x()); + + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, _lws_hint); + } + while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); +} diff --git 
a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp new file mode 100644 index 0000000000..e5bc3f9656 --- /dev/null +++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLGaussian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Set build options + std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1", + "-DMAT3=2", "-DMAT4=4", "-DMAT5=2", + "-DMAT6=1", "-DMAT7=2", "-DMAT8=1", + "-DSCALE=16", "-DDATA_TYPE_OUT=uchar" + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp 
b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp new file mode 100644 index 0000000000..bd523c883d --- /dev/null +++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" + +#include <cstdint> + +using namespace arm_compute; + +void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + const int16_t matrix[] = { 1, 4, 6, 4, 1 }; + + // Set arguments + CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined); +} + +void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + const uint32_t scale = 256; + const int16_t matrix[] = { 1, 4, 6, 4, 1 }; + + // Set arguments + CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined); +} diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp new file mode 100644 index 0000000000..34a228c717 --- /dev/null +++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel() + : _border_size(0), _l2_load_offset(0) +{ +} + +BorderSize CLGaussianPyramidHorKernel::border_size() const +{ + return _border_size; +} + +void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 
0 : 2, 2); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_read_per_iteration = 20; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr float scale_x = 0.5f; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x); + + // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even + // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether + // a pixel is even or odd is determined based on the tensor shape not the + // valid region!) + // Thus the offset from which the first pixel (L2) for the convolution is + // loaded depends on the anchor and shape of the valid region. + // In the case of an even shape (= even image width) we need to load L2 + // from -2 if the anchor is odd and from -1 if the anchor is even. That + // makes sure that L2 is always loaded from an odd pixel. + // On the other hand, for an odd shape (= odd image width) we need to load + // L2 from -1 if the anchor is odd and from -2 if the anchor is even to + // achieve the opposite effect. + // The condition can be simplified to checking whether anchor + shape is + // odd (-2) or even (-1) as only adding an odd and an even number will have + // an odd result. 
+ _l2_load_offset = -border_size().left; + + if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0) + { + _l2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f)); + valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window win_in(window); + win_in.shift(Window::DimX, _l2_load_offset); + + //The output is half the width of the input: + Window win_out(window); + win_out.scale(Window::DimX, 0.5f); + + Window slice_in = win_in.first_slice_window_2D(); + Window slice_out = win_out.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); +} + +CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel() + : _t2_load_offset(0) +{ +} + +BorderSize CLGaussianPyramidVertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 
output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_rows_processed_per_iteration = 2; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_per_iteration = 5; + constexpr float scale_y = 0.5f; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), + border_undefined, border_size()); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y); + + // Determine whether we need to load even or odd rows. See above for a + // detailed explanation. + _t2_load_offset = -border_size().top; + + if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0) + { + _t2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f)); + valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? 
border_size().bottom : 0)) / 2 - valid_region.anchor[1]); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(window.x().step() != 8); + ARM_COMPUTE_ERROR_ON(window.y().step() % 2); + + Window win_in(window); + win_in.shift(Window::DimY, _t2_load_offset); + + Window win_out(window); + win_out.scale(Window::DimY, 0.5f); + + Window slice_in = win_in.first_slice_window_2D(); + Window slice_out = win_out.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp new file mode 100644 index 0000000000..87659c4ba9 --- /dev/null +++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: tensor pointers start as nullptr and the cell size is
+ *  default-constructed; configure() fills them in.
+ */
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+    : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+/** Validate the tensors, build the "hog_orientation_binning" OpenCL kernel and
+ *  set up the execution window over the output tensor.
+ *
+ * @param[in]  input_magnitude Single-channel S16 tensor.
+ * @param[in]  input_phase     Single-channel U8 tensor with the same X and Y dimensions as @p input_magnitude.
+ * @param[out] output          F32 tensor with hog_info->num_bins() channels.
+ * @param[in]  hog_info        Must not be nullptr. Supplies cell size, number of bins and phase type.
+ */
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+    _input_magnitude = input_magnitude;
+    _input_phase = input_phase;
+    _output = output;
+    _cell_size = hog_info->cell_size();
+
+    // Bins per degree: the bins cover 360 degrees for signed phase and 180 degrees otherwise
+    float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
+    // NOTE(review): the extra 360/255 factor presumably converts a U8 phase value (0..255) to degrees for signed phase - confirm against the phase kernel
+    phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
+
+    std::stringstream args_str;
+    args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+    args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+    args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+    args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+    // Collect the build options (all defines are passed as one option string)
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration = 1;
+    const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height;
+    constexpr unsigned int num_elems_written_per_iteration = 1;
+
+    // Configure kernel window (one work-item per output element)
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    // The whole output tensor is marked valid
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the orientation binning kernel once per 2D slice of @p window.
+ *
+ * @param[in]     window Sub-window of the window configured in configure().
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        // Compute slice for the magnitude and phase tensors: the window start is
+        // scaled by the cell size and the step is one cell (start == end here).
+        // NOTE(review): this is derived from `window`, not from `slice`, so it is the same on every iteration - confirm this is intended for windows with more than one 2D slice
+        Window slice_mag_phase = window.first_slice_window_2D();
+        slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+        slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+        add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+
+/** Default constructor: tensor pointers start as nullptr and the block stride
+ *  (in cells) is default-constructed; configure() fills them in.
+ */
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+    : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+/** Validate the tensors, build the "hog_block_normalization" OpenCL kernel and
+ *  set up the execution window over the output tensor.
+ *
+ * @param[in]  input    F32 tensor with hog_info->num_bins() channels.
+ * @param[out] output   F32 tensor with hog_info->num_bins() * (cells per block) channels.
+ * @param[in]  hog_info Must not be nullptr. Supplies block size, block stride, cell size,
+ *                      number of bins, normalization type and L2-Hys threshold.
+ */
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+    // Number of cells per block
+    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+                                     hog_info->block_size().height / hog_info->cell_size().height);
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+    // Number of cells per block stride
+    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+                                            hog_info->block_stride().height / hog_info->cell_size().height);
+
+    _input = input;
+    _output = output;
+    _num_cells_per_block_stride = num_cells_per_block_stride;
+
+    std::stringstream args_str;
+    args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+    args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+    // The numeric values of the normalization enum are exported so the kernel can compare HOG_NORM_TYPE against them
+    args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+    args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+    args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+    args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+    // Collect the build options (all defines are passed as one option string)
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration = 1;
+    const unsigned int num_rows_read_per_iteration = num_cells_per_block.height;
+    constexpr unsigned int num_elems_written_per_iteration = 1;
+    const unsigned int num_rows_written_per_iteration = num_cells_per_block.height;
+
+    // Configure kernel window (one work-item per output element)
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the block normalization kernel once per 2D slice of @p window.
+ *
+ * @param[in]     window Sub-window of the window configured in configure().
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        // Compute slice for the input tensor: same range as the first 2D slice
+        // of the window, but stepping by the block stride expressed in cells
+        Window slice_in = window.first_slice_window_2D();
+        slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+        slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..0f9a98950d
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLHOG.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLHOGDetectorKernel::CLHOGDetectorKernel() + : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr) +{ +} + +void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, + float threshold, uint16_t idx_class) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32); + ARM_COMPUTE_ERROR_ON(hog == nullptr); + ARM_COMPUTE_ERROR_ON(detection_windows == nullptr); + ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr); + ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0); + ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0); + + const Size2D &detection_window_size = hog->info()->detection_window_size(); + const Size2D &block_size = hog->info()->block_size(); + const Size2D &block_stride = hog->info()->block_stride(); + + _input = input; + _detection_windows = detection_windows; + _num_detection_windows = num_detection_windows; + + const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels(); + const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1; + + ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != 
hog->info()->descriptor_size()); + + std::stringstream args_str; + args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " "; + args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " "; + args_str << "-DTHRESHOLD=" << threshold << " "; + args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " "; + args_str << "-DIDX_CLASS=" << idx_class << " "; + args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " "; + args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " "; + args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " "; + args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " "; + + // Construct kernel name + std::set<std::string> build_opts = {}; + build_opts.insert(args_str.str()); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters + _kernel.setArg(idx++, hog->cl_buffer()); + _kernel.setArg(idx++, detection_windows->cl_buffer()); + _kernel.setArg(idx++, *_num_detection_windows); + + // Get the number of blocks along the x and y directions of the input tensor + const ValidRegion &valid_region = input->info()->valid_region(); + const size_t num_blocks_x = valid_region.shape[0]; + const size_t num_blocks_y = valid_region.shape[1]; + + // Get the number of blocks along the x and y directions of the detection window + const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width; + const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height; + + const size_t window_step_x = detection_window_stride.width / block_stride.width; + const size_t window_step_y = detection_window_stride.height / block_stride.height; + + // Configure kernel window + Window win; + 
win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x)); + win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y)); + + constexpr unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y; + + update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration)); + + ICLKernel::configure(win); +} + +void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp new file mode 100644 index 0000000000..9fc34a7760 --- /dev/null +++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLHarrisScoreKernel::CLHarrisScoreKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0) +{ +} + +BorderSize CLHarrisScoreKernel::border_size() const +{ + return _border_size; +} + +void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output, + int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); 
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); + ARM_COMPUTE_ERROR_ON(0.0f == norm_factor); + + _input1 = input1; + _input2 = input2; + _output = output; + _sensitivity = sensitivity; + _strength_thresh = strength_thresh; + _norm_factor = norm_factor; + _border_size = BorderSize(block_size / 2); + + // Select kernel + std::stringstream harris_score_kernel_name; + harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size; + + // Create build options + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts)); + + // Set static kernel arguments + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, sensitivity); + _kernel.setArg(idx++, strength_thresh); + _kernel.setArg(idx++, norm_factor); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + constexpr unsigned int num_elems_written_per_iteration = 4; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), 
input2->info()->valid_region()); + output_access.set_valid_region(win, valid_region, border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp new file mode 100644 index 0000000000..87ee5fb74e --- /dev/null +++ b/src/core/CL/kernels/CLHistogramKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLDistribution1D.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstring> +#include <string> + +using namespace arm_compute; + +// each thread handle 16 pixels +constexpr signed int pixels_per_item = 16; + +// local work group size in X dimension +constexpr unsigned int local_x_size = 16; + +CLHistogramKernel::CLHistogramKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + // Check input size + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + // Check offset + ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); + + // Check range + ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); + + _input = input; + _output = output; + + if(_input->info()->dimension(0) < pixels_per_item) + { + return; + } + + unsigned int num_bins = _output->num_bins(); + unsigned int window_size = _output->window(); + unsigned int offset = _output->offset(); + unsigned int range = _output->range(); + unsigned int offrange = offset + range; + unsigned int bin_size = 
_output->size(); + unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions + + // Create kernel + bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); + std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, buffer_size, nullptr); + _kernel.setArg(idx++, _output->cl_buffer()); + if(!is_fixed_size) + { + _kernel.setArg<cl_uint>(idx++, num_bins); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg<cl_uint>(idx++, range); + _kernel.setArg<cl_uint>(idx++, offrange); + } + + // We only run histogram on Image, therefore only 2 dimensions here + unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item; + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, end_position, pixels_per_item)); + win.set(1, Window::Dimension(0, _input->info()->dimension(1))); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item)); + + ICLKernel::configure(win); +} + +void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + if(_input->info()->dimension(0) < pixels_per_item) + { + return; + } + + _output->map(queue, true); + ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); + memset(_output->buffer(), 0, _output->size()); + _output->unmap(queue); + + Window slice = window.first_slice_window_2D(); + cl::NDRange lws = cl::NDRange(local_x_size, 1); + + do + { + /* Run the core part which has width can be divided by 16 */ + unsigned int idx = 0; + add_2D_tensor_argument(idx, 
_input, slice); + + enqueue(queue, *this, slice, lws); + } + while(window.slide_window_slice_2D(slice)); +} + +CLHistogramBorderKernel::CLHistogramBorderKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + // Check input size + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + // Check offset + ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); + + // Check range + ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); + + // We only run histogram on Image, therefore only 2 dimensions here + unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item; + + if(start_position >= input->info()->dimension(0)) + { + return; // no need to run histogram border kernel + } + + _input = input; + _output = output; + + unsigned int num_bins = _output->num_bins(); + unsigned int window_size = _output->window(); + unsigned int offset = _output->offset(); + unsigned int range = _output->range(); + unsigned int offrange = offset + range; + + // Create kernel + bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); + std::string kernel_name = is_fixed_size ? 
"hist_border_kernel_fixed" : "hist_border_kernel"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, _output->cl_buffer()); + if(!is_fixed_size) + { + _kernel.setArg<cl_uint>(idx++, num_bins); + _kernel.setArg<cl_uint>(idx++, offset); + _kernel.setArg<cl_uint>(idx++, range); + _kernel.setArg<cl_uint>(idx++, offrange); + } + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(start_position, _input->info()->dimension(0))); + win.set(1, Window::Dimension(0, _input->info()->dimension(1))); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1)); + ICLKernel::configure(win); +} + +void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue) +{ + if(window.x().start() >= window.x().end()) + { + return; + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + cl::NDRange lws = cl::NDRange(1, 1); + + Window slice = window.first_slice_window_2D(); + + do + { + /* Run the border part which has width cannot be divided by 16 */ + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + enqueue(queue, *this, slice, lws); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp new file mode 100644 index 0000000000..8c0fe26666 --- /dev/null +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <cmath> +#include <tuple> + +using namespace arm_compute; + +CLIm2ColKernel::CLIm2ColKernel() + : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr) +{ +} + +void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + + _input = input; + _output = output; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace((has_bias ? "-DHAS_BIAS" : "")); + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + std::tie(pad_x, pad_y) = conv_info.pad(); + std::tie(stride_x, stride_y) = conv_info.stride(); + + const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) + && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)) + && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0)); + + if(!run_img2col_reduced) + { + _convolved_dims = convolved_dims; + _conv_info = conv_info; + _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 
1 : 0)) / input->info()->dimension(2)); + _num_elems_processed_per_iteration = output->info()->dimension(0); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts)); + + // Create static kernel arguments + const cl_int2 input_dims = + { + { + static_cast<cl_int>(input->info()->dimension(0)), + static_cast<cl_int>(input->info()->dimension(1)), + } + }; + const cl_int2 strides = + { + { + stride_x, + stride_y, + } + }; + const cl_int2 paddings = + { + { + pad_x, + pad_y, + } + }; + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + _kernel.setArg<cl_int>(idx++, _kernel_size); + _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */); + _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */); + _kernel.setArg<cl_int2>(idx++, input_dims); + _kernel.setArg<cl_int2>(idx++, strides); + _kernel.setArg<cl_int2>(idx++, paddings); + + _run_func = &CLIm2ColKernel::run_generic; + } + else + { + _num_elems_processed_per_iteration = 1; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts)); + _run_func = &CLIm2ColKernel::run_reduced; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + ICLKernel::configure(win); +} + +void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON(_run_func == nullptr); + (this->*_run_func)(window, queue); +} + +void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + 
std::tie(pad_x, pad_y) = _conv_info.pad(); + std::tie(stride_x, stride_y) = _conv_info.stride(); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + Window slice_in = window.first_slice_window_3D(); + Window slice_out = window.first_slice_window_3D(); + + // Setup slice + slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1)); + slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1)); + slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input slice + // The first three dimensions of the input are increased by the inner loops + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Setup output slice + slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration)); + slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1)); + slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + // Set inputs + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in)); +} + +void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()); + + Window out_slice = out_window.first_slice_window_1D(); + Window in_slice = window.first_slice_window_3D(); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _output, out_slice); + + 
_kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0)); + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1)); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); +} diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp new file mode 100644 index 0000000000..69ede457df --- /dev/null +++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cstddef> + +using namespace arm_compute; + +void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal")); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); + const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +CLIntegralImageVertKernel::CLIntegralImageVertKernel() + : _in_out(nullptr) +{ +} + +void CLIntegralImageVertKernel::configure(ICLTensor *in_out) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32); + + _in_out = in_out; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical")); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + const unsigned int 
num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY); + + Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, in_out_access); + + in_out_access.set_valid_region(win, in_out->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const size_t height = _in_out->info()->dimension(1); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _in_out, slice); + _kernel.setArg<cl_uint>(idx++, height); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp new file mode 100644 index 0000000000..12cdd0ec93 --- /dev/null +++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLArray.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> + +using namespace arm_compute; + +void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale) + +{ + ARM_COMPUTE_ERROR_ON(old_points == nullptr); + ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + + const float scale = std::pow(pyramid_scale, level); + + // Create kernel + std::string kernel_name = "init_level"; + if(level == (num_levels - 1)) + { + kernel_name += (use_initial_estimate) ? 
std::string("_max_initial_estimate") : std::string("_max"); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set static kernel arguments + unsigned int idx = 0; + if(level == (num_levels - 1)) + { + _kernel.setArg(idx++, old_points->cl_buffer()); + if(use_initial_estimate) + { + _kernel.setArg(idx++, new_points_estimates->cl_buffer()); + } + } + _kernel.setArg(idx++, old_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg<cl_float>(idx++, scale); + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + ICLKernel::configure(window); +} + +void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + enqueue(queue, *this, window); +} + +void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points) + +{ + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points == nullptr); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize")); + + // Set static kernel arguments + unsigned int idx = 0; + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points->cl_buffer()); + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + ICLKernel::configure(window); +} + +void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + enqueue(queue, *this, window); 
+} + +CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel() + : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr) +{ +} + +void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + size_t window_dimension, size_t level) + +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(coeff_table == nullptr); + ARM_COMPUTE_ERROR_ON(old_ival == nullptr); + + _old_input = old_input; + _old_scharr_gx = old_scharr_gx; + _old_scharr_gy = old_scharr_gy; + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const ValidRegion valid_region = intersect_valid_regions( + old_input->info()->valid_region(), + old_scharr_gx->info()->valid_region(), + old_scharr_gy->info()->valid_region()); + + update_window_and_padding(window, + AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1)), + AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1)), + AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1))); + + ICLKernel::configure(window); + + // Initialize required variables + const int level0 = (level == 0) ? 
1 : 0; + const int window_size = window_dimension; + const int window_size_squared = window_dimension * window_dimension; + const int window_size_half = window_dimension / 2; + const float eig_const = 1.0f / (2.0f * window_size_squared); + const cl_float3 border_limits = + { + { + // -1 because we load 2 values at once for bilinear interpolation + static_cast<cl_float>(valid_region.end(0) - window_size - 1), + static_cast<cl_float>(valid_region.end(1) - window_size - 1), + static_cast<cl_float>(valid_region.start(0)) + } + }; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0")); + + // Set arguments + unsigned int idx = 3 * num_arguments_per_2D_tensor(); + _kernel.setArg(idx++, old_points_internal->cl_buffer()); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, coeff_table->cl_buffer()); + _kernel.setArg(idx++, old_ival->cl_buffer()); + _kernel.setArg<cl_int>(idx++, window_size); + _kernel.setArg<cl_int>(idx++, window_size_squared); + _kernel.setArg<cl_int>(idx++, window_size_half); + _kernel.setArg<cl_float3>(idx++, border_limits); + _kernel.setArg<cl_float>(idx++, eig_const); + _kernel.setArg<cl_int>(idx++, level0); +} + +void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Set static tensor arguments. Setting here as allocation might be deferred. 
+ unsigned int idx = 0; + add_2D_tensor_argument(idx, _old_input, window); + add_2D_tensor_argument(idx, _old_scharr_gx, window); + add_2D_tensor_argument(idx, _old_scharr_gy, window); + + enqueue(queue, *this, window); +} + +CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel() + : _new_input(nullptr) +{ +} + +void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level) + +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr); + ARM_COMPUTE_ERROR_ON(coeff_table == nullptr); + ARM_COMPUTE_ERROR_ON(old_ival == nullptr); + + _new_input = new_input; + + // Configure kernel window + Window window; + window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1)); + window.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const ValidRegion &valid_region = new_input->info()->valid_region(); + + update_window_and_padding(window, + AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1), + valid_region.end(0), valid_region.end(1))); + + ICLKernel::configure(window); + + // Initialize required variables + const int level0 = (level == 0) ? 
1 : 0; + const int window_size = window_dimension; + const int window_size_squared = window_dimension * window_dimension; + const int window_size_half = window_dimension / 2; + const float eig_const = 1.0f / (2.0f * window_size_squared); + const cl_float3 border_limits = + { + { + // -1 because we load 2 values at once for bilinear interpolation + static_cast<cl_float>(valid_region.end(0) - window_size - 1), + static_cast<cl_float>(valid_region.end(1) - window_size - 1), + static_cast<cl_float>(valid_region.start(0)) + } + }; + const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; + const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1")); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor(); + _kernel.setArg(idx++, new_points_internal->cl_buffer()); + _kernel.setArg(idx++, coeff_table->cl_buffer()); + _kernel.setArg(idx++, old_ival->cl_buffer()); + _kernel.setArg<cl_int>(idx++, window_size); + _kernel.setArg<cl_int>(idx++, window_size_squared); + _kernel.setArg<cl_int>(idx++, window_size_half); + _kernel.setArg<cl_int>(idx++, num_iterations); + _kernel.setArg<cl_float>(idx++, epsilon); + _kernel.setArg<cl_float3>(idx++, border_limits); + _kernel.setArg<cl_float>(idx++, eig_const); + _kernel.setArg<cl_int>(idx++, level0); + _kernel.setArg<cl_int>(idx++, term_iteration); + _kernel.setArg<cl_int>(idx++, term_epsilon); +} + +void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Set static tensor arguments. Setting here as allocation might be deferred. 
+ unsigned int idx = 0; + add_2D_tensor_argument(idx, _new_input, window); + + enqueue(queue, *this, window); +} diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..794a1bc56e --- /dev/null +++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + + _input0 = input0; + _input1 = input1; + _output = output; + + if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } + + std::ostringstream mm_arguments; + std::set<std::string> build_opts; + + mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " "; + build_opts.emplace(mm_arguments.str()); + + // Create kernel + std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts)); + + // Configure window kernel + const unsigned int 
num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + Window matrix_b_window; + matrix_b_window.use_tensor_dimensions(_input1->info()); + Window slice_matrix_b = matrix_b_window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input0, slice); + add_3D_tensor_argument(idx, _input1, slice_matrix_b); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp new file mode 100644 index 0000000000..c504189169 --- /dev/null +++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLMagnitudePhaseKernel::CLMagnitudePhaseKernel() + : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false) +{ +} + +void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, + MagnitudeType mag_type, PhaseType phase_type) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy); + + _run_mag = (magnitude != nullptr); + _run_phase = (phase != nullptr); + if(_run_mag) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude); + } + if(_run_phase) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + } + + if(!_run_mag && !_run_phase) + { + ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); + } + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + // Construct kernel name + std::set<std::string> build_opts = {}; + + // Add magnitude type + if(_run_mag) + { + switch(mag_type) + { + case MagnitudeType::L1NORM: + build_opts.insert("-DMAGNITUDE=1"); + break; + case MagnitudeType::L2NORM: + build_opts.insert("-DMAGNITUDE=2"); + break; + default: + 
ARM_COMPUTE_ERROR("Unsupported magnitude calculation type."); + build_opts.insert("-DMAGNITUDE=0"); + break; + } + } + + // Add phase type + if(_run_phase) + { + switch(phase_type) + { + case PhaseType::UNSIGNED: + build_opts.insert("-DPHASE=1"); + break; + case PhaseType::SIGNED: + build_opts.insert("-DPHASE=2"); + break; + default: + ARM_COMPUTE_ERROR("Unsupported phase calculation type."); + build_opts.insert("-DPHASE=0"); + break; + } + } + + // Add data_type + build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_phase_access(phase == nullptr ? 
nullptr : phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + gx_access, gy_access, + output_magnitude_access, output_phase_access); + + ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), + gy->info()->valid_region()); + output_magnitude_access.set_valid_region(win, valid_region); + output_phase_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _gx, slice); + add_2D_tensor_argument(idx, _gy, slice); + + if(_run_mag) + { + add_2D_tensor_argument(idx, _magnitude, slice); + } + + if(_run_phase) + { + add_2D_tensor_argument(idx, _phase, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp new file mode 100644 index 0000000000..b0b748f466 --- /dev/null +++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <set> +#include <string> + +using namespace arm_compute; + +CLMeanStdDevKernel::CLMeanStdDevKernel() + : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr) +{ +} + +void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == mean); + ARM_COMPUTE_ERROR_ON(nullptr == global_sum); + ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared); + + _input = input; + _mean = mean; + _stddev = stddev; + _global_sum = global_sum; + _global_sum_squared = global_sum_squared; + + // Create kernel + std::set<std::string> build_opts; + + if(_stddev != nullptr) + { + build_opts.insert("-DSTDDEV"); + } + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", build_opts)); + + // Set fixed arguments + unsigned 
int idx = num_arguments_per_2D_tensor(); //Skip the input parameters + + _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1))); + _kernel.setArg(idx++, *_global_sum); + + if(_stddev != nullptr) + { + _kernel.setArg(idx++, *_global_sum_squared); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + update_window_and_padding(win, input_access); + + ICLKernel::configure(win); +} + +void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Clear sums + static const cl_ulong zero = 0; + queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero); + + if(_stddev != nullptr) + { + queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero); + } + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + // Set slice step equal to height to force gws[1] to 1, + // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item + slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); + + // Calculate mean and stddev + cl_ulong global_sum = 0; + cl_ulong global_sum_squared = 0; + const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1); + + queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), 
static_cast<void *>(&global_sum)); + const float mean = global_sum / num_pixels; + *_mean = mean; + + if(_stddev != nullptr) + { + queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared)); + *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean)); + } +} diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp new file mode 100644 index 0000000000..95334c7b5f --- /dev/null +++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +BorderSize CLMedian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" })); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp new file mode 100644 index 0000000000..939a53b03a --- /dev/null +++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <climits> + +using namespace arm_compute; + +CLMinMaxKernel::CLMinMaxKernel() + : _input(nullptr), _min_max(), _data_type_max_min() +{ +} + +void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(min_max == nullptr); + + _input = input; + _min_max = min_max; + const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); + + switch(input->info()->data_type()) + { + case DataType::U8: + _data_type_max_min[0] = UCHAR_MAX; + _data_type_max_min[1] = 0; + break; + case DataType::S16: + _data_type_max_min[0] = SHRT_MAX; + _data_type_max_min[1] = SHRT_MIN; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong image data types"); + } + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0])); + build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1])); + build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? 
"-DNON_MULTIPLE_OF_16" : ""); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts)); + + // Set fixed arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, *_min_max); + _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Reset mininum and maximum values + queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data()); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLMinMaxLocationKernel::CLMinMaxLocationKernel() + : _input(nullptr), _min_max_count(nullptr) +{ +} + +void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(min_max == nullptr); + ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr); + + _input = input; + _min_max_count = min_max_count; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + 
build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : ""); + build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : ""); + build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : ""); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts)); + + // Set static arguments + unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, *min_max); + _kernel.setArg(idx++, *min_max_count); + if(min_loc != nullptr) + { + _kernel.setArg(idx++, min_loc->cl_buffer()); + _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values()); + } + if(max_loc != nullptr) + { + _kernel.setArg(idx++, max_loc->cl_buffer()); + _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values()); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + ICLKernel::configure(win); +} + +void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + static const unsigned int zero_count = 0; + queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count); + queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp new file mode 100644 index 0000000000..6afa5822ba --- 
/dev/null +++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +CLNonLinearFilterKernel::CLNonLinearFilterKernel() + : _border_size(0) +{ +} + +BorderSize CLNonLinearFilterKernel::border_size() const +{ + return _border_size; +} + +void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, + unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5); + ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!"); + ARM_COMPUTE_UNUSED(mask); + + _input = input; + _output = output; + _border_size = BorderSize(mask_size / 2); + + // Define build options + std::set<std::string> build_opts; + build_opts.emplace("-D" + string_from_non_linear_filter_function(function)); + + // Define kernel + std::string pattern_name = string_from_matrix_pattern(pattern); + std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower); + std::stringstream ss; + ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size; + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(ss.str(), build_opts)); + + // 
Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + const unsigned int num_rows_read_per_iteration = mask_size; + + Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp new file mode 100644 index 0000000000..6a96b0effd --- /dev/null +++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); + + _input = input; + _output = output; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal 
output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..106a5113db --- /dev/null +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLNormalizationLayerKernel::CLNormalizationLayerKernel() + : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0) +{ +} + +BorderSize CLNormalizationLayerKernel::border_size() const +{ + return _border_size; +} + +void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); + ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented"); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + _input = input; + _squared_input = squared_input; + _output = output; + + const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D); + const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0; + _border_size = BorderSize(0, border_width); + + // Create kernel + std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? 
"normalization_layer_in_map_1D" : "normalization_layer_cross_map"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set kernel static arguments + unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output parameters + _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff()); + _kernel.setArg<cl_float>(idx++, norm_info.beta()); + _kernel.setArg<cl_float>(idx++, norm_info.kappa()); + _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1; + const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, squared_input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _squared_input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp 
b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp new file mode 100644 index 0000000000..84eb434bc9 --- /dev/null +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstdlib> +#include <set> +#include <string> + +using namespace arm_compute; + +CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. 
"); + + _input1 = input1; + _input2 = input2; + _output = output; + + int scale_int = -1; + // Extract sign, exponent and mantissa + int exponent = 0; + float normalized_mantissa = std::frexp(scale, &exponent); + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + { + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + scale_int = std::abs(exponent - 1); + } + + std::string data_type; + std::string compute_type; + // Check if it has float inputs and output + if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type())) + { + scale_int = -1; + compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half"; + data_type = "DATA_TYPE_FLOAT"; + } + else + { + compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort"; + data_type = "DATA_TYPE_INT"; + } + + // Construct kernel name + std::string kernel_name = "pixelwise_mul"; + kernel_name += (scale_int >= 0) ? "_int" : "_float"; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); + build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" : "-DROUND=_rte"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); + build_opts.emplace("-D" + data_type); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set scale argument + unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters + + if(scale_int >= 0) + { + _kernel.setArg(idx++, scale_int); + } + else + { + _kernel.setArg(idx++, scale); + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure(win); +} + +void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, slice); + add_2D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _output, slice); 
+ enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp new file mode 100644 index 0000000000..dc5ae4ec7a --- /dev/null +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> +#include <tuple> + +using namespace arm_compute; + +CLPoolingLayerKernel::CLPoolingLayerKernel() + : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0) +{ +} + +BorderSize CLPoolingLayerKernel::border_size() const +{ + return _border_size; +} + +void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info) +{ + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + const PoolingType pool_type = pool_info.pool_type(); + const int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + DimensionRoundingType pool_round = pad_stride_info.round(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size); + ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), + input->info()->dimension(1), + pool_size, + pool_stride_x, pool_stride_y, + 
pool_pad_x, pool_pad_y, + pool_round); + ARM_COMPUTE_UNUSED(pooled_w); + ARM_COMPUTE_UNUSED(pooled_h); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); + + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + // Set instance variables + _input = input; + _output = output; + _pool_info = pool_info; + _border_size = BorderSize(pool_pad_y, pool_pad_x); + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? 
std::string("MAX") : std::string("AVG")))); + + // Create kernel + std::string kernel_name = "pooling_layer_" + val_to_string(pool_size); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Set static kernel arguments + if(pool_type == PoolingType::AVG) + { + // Create static kernel arguments + const cl_int2 max_dims = + { + { + static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x, + static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y, + } + }; + const cl_int2 strides = + { + { + pool_stride_x, + pool_stride_y, + } + }; + const cl_int2 paddings = + { + { + pool_pad_x, + pool_pad_y, + } + }; + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + _kernel.setArg<cl_int2>(idx++, max_dims); + _kernel.setArg<cl_int2>(idx++, strides); + _kernel.setArg<cl_int2>(idx++, paddings); + } + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + Window slice = 
window.first_slice_window_3D(); + + do + { + // Upsample input by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y)); + + // Set inputs + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp new file mode 100644 index 0000000000..e63a5ef7c6 --- /dev/null +++ b/src/core/CL/kernels/CLRemapKernel.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLRemapKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <algorithm> + +using namespace arm_compute; + +CLRemapKernel::CLRemapKernel() + : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr) +{ +} + +BorderSize CLRemapKernel::border_size() const +{ + return BorderSize(1); +} + +void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!"); + + _input = input; + _output = output; + _map_x = map_x; + _map_y = map_y; + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string kernel_name = "remap_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 4; + const int border_offset = (border_undefined) ? 
0 : border_size().left; + + Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowStatic input_access(output->info(), -border_offset, -border_offset, + _output->info()->dimension(0) + border_offset, _output->info()->dimension(1) + border_offset); + AccessWindowHorizontal output_access(input->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); + + // Set static arguments + unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<cl_float>(idx++, input->info()->dimension(0)); + _kernel.setArg<cl_float>(idx++, input->info()->dimension(1)); +} + +void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _output, slice); + add_2D_tensor_argument(idx, _map_x, slice); + add_2D_tensor_argument(idx, _map_y, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp new file mode 100644 index 0000000000..d74e837ace --- /dev/null +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLScaleKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +BorderSize CLScaleKernel::border_size() const +{ + return BorderSize(1); +} + +void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + /* Compute the ratio between source width/height and destination width/height */ + const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0)); + const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1)); + + /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */ + if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + { + policy = InterpolationPolicy::NEAREST_NEIGHBOR; + } + else + { + ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA); + } + + // Create kernel + std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string 
kernel_name = "scale_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + const int border_offset = (border_undefined) ? 0 : border_size().left; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, + input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<float>(idx++, input->info()->dimension(0)); + _kernel.setArg<float>(idx++, input->info()->dimension(1)); + _kernel.setArg<float>(idx++, output->info()->dimension(0)); + _kernel.setArg<float>(idx++, output->info()->dimension(1)); +} diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp new file mode 100644 index 0000000000..913ef592d4 --- /dev/null +++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: neither gradient enabled, no tensors bound. */
+CLScharr3x3Kernel::CLScharr3x3Kernel()
+    : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+/** The kernel declares a border of one element on every side. */
+BorderSize CLScharr3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Validate the tensors, build the "scharr3x3" OpenCL kernel and set up its execution window.
+ *
+ * At least one of @p output_x and @p output_y must be non-null; a gradient is only
+ * computed for the outputs that are provided.
+ *
+ * @param[in]  input            Source tensor. Single channel, U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Single channel, S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Single channel, S16.
+ * @param[in]  border_undefined True if the border is undefined; forwarded to the window and
+ *                              valid-region computations.
+ */
+void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its output tensor was supplied
+    _run_scharr_x = output_x != nullptr;
+    _run_scharr_y = output_y != nullptr;
+
+    if(_run_scharr_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_scharr_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    if(_run_scharr_x)
+    {
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_scharr_y)
+    {
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // A missing output is represented by an access window built on a null tensor info
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the kernel once per 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        if(_run_scharr_x)
+        {
+            add_2D_tensor_argument(idx, _output_x, slice);
+        }
+
+        if(_run_scharr_y)
+        {
+            add_2D_tensor_argument(idx, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
new file mode 100644
index 0000000000..436aaa498a
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors bound, neither gradient enabled. */
+CLSobel3x3Kernel::CLSobel3x3Kernel()
+    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+/** The kernel declares a border of one element on every side. */
+BorderSize CLSobel3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Validate the tensors, build the "sobel3x3" OpenCL kernel and set up its execution window.
+ *
+ * At least one of @p output_x and @p output_y must be non-null; a gradient is only
+ * computed for the outputs that are provided.
+ *
+ * @param[in]  input            Source tensor. Single channel, U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Single channel, S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Single channel, S16.
+ * @param[in]  border_undefined True if the border is undefined; forwarded to the window and
+ *                              valid-region computations.
+ */
+void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its output tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // A missing output is represented by an access window built on a null tensor info
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the kernel once per 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(idx, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(idx, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
new file mode 100644
index 0000000000..4c0316f19e
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors bound, neither gradient enabled, zero border. */
+CLSobel5x5HorKernel::CLSobel5x5HorKernel()
+    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+/** Border required around the input; the value is set by configure(). */
+BorderSize CLSobel5x5HorKernel::border_size() const
+{
+    return _border_size;
+}
+
+/** Validate the tensors, build the "sobel_separable1x5" OpenCL kernel and set up its execution window.
+ *
+ * At least one of @p output_x and @p output_y must be non-null; a gradient is only
+ * computed for the outputs that are provided.
+ *
+ * @param[in]  input            Source tensor. Single channel, U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Single channel, S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Single channel, S16.
+ * @param[in]  border_undefined True if the border is undefined; selects the stored border size and is
+ *                              forwarded to the window and valid-region computations.
+ */
+void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its output tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+    // NOTE(review): presumably BorderSize(top/bottom, left/right), i.e. no vertical border when the
+    // border is undefined - confirm against the BorderSize constructor
+    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // A missing output is represented by an access window built on a null tensor info
+    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the kernel once per 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(idx, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(idx, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+
+/** Default constructor: no tensors bound, neither gradient enabled. */
+CLSobel5x5VertKernel::CLSobel5x5VertKernel()
+    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+/** Fixed border of this kernel, constructed as BorderSize(2, 0). */
+BorderSize CLSobel5x5VertKernel::border_size() const
+{
+    return BorderSize(2, 0);
+}
+
+/** Validate the tensors, build the "sobel_separable5x1" OpenCL kernel and set up its execution window.
+ *
+ * At least one of @p output_x and @p output_y must be non-null. The matching input is only
+ * type-checked for the gradients that are enabled.
+ *
+ * @param[in]  input_x          (Optional) Source for the X gradient. Single channel, S16.
+ * @param[in]  input_y          (Optional) Source for the Y gradient. Single channel, S16.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Single channel, S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Single channel, S16.
+ * @param[in]  border_undefined True if the border is undefined; forwarded to the window and
+ *                              valid-region computations.
+ */
+void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its output tensor was supplied
+    _run_sobel_x = output_x != nullptr;
+    _run_sobel_y = output_y != nullptr;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input_x  = input_x;
+    _input_y  = input_y;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts));
+
+    // Reference tensor for the window and valid region: the X input when X is enabled, otherwise the Y input
+    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 8;
+    constexpr unsigned int num_rows_read_per_iteration       = 5;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // A missing tensor is represented by an access window built on a null tensor info
+    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the kernel once per 2D slice of @p window.
+ *
+ * For each enabled gradient the input/output pair is bound, followed by one trailing
+ * integer argument that is always set to 0.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0;
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(idx, _input_x, slice);
+            add_2D_tensor_argument(idx, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(idx, _input_y, slice);
+            add_2D_tensor_argument(idx, _output_y, slice);
+        }
+
+        // NOTE(review): placeholder value; presumably the OpenCL kernel declares an extra
+        // parameter that this variant does not use - confirm against the .cl source
+        _kernel.setArg(idx++, 0 /*dummy*/);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
new file mode 100644
index 0000000000..a477953cfb
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +CLSobel7x7HorKernel::CLSobel7x7HorKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) +{ +} + +BorderSize CLSobel7x7HorKernel::border_size() const +{ + return _border_size; +} + +void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + _border_size = BorderSize(border_undefined ? 
0 : 3, 3); + + // Construct kernel name + std::string kernel_name = "sobel_separable1x7"; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _output_y, slice); + } + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLSobel7x7VertKernel::CLSobel7x7VertKernel() + : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize CLSobel7x7VertKernel::border_size() const +{ + return BorderSize(3, 0); +} + +void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); + } + + _input_x = input_x; + _input_y = input_y; + _output_x = output_x; + _output_y = 
output_y; + + // Set build options + std::set<std::string> build_opts; + + if(_run_sobel_x) + { + build_opts.insert("-DGRAD_X"); + } + + if(_run_sobel_y) + { + build_opts.insert("-DGRAD_Y"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts)); + + const ICLTensor *input = _run_sobel_x ? _input_x : _input_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 7; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + ICLKernel::configure(win); +} + +void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + + if(_run_sobel_x) + { + add_2D_tensor_argument(idx, _input_x, slice); + add_2D_tensor_argument(idx, _output_x, slice); + } + + if(_run_sobel_y) + { + add_2D_tensor_argument(idx, _input_y, slice); + add_2D_tensor_argument(idx, _output_y, slice); + } + + _kernel.setArg(idx++, 0 /*dummy*/); + + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp new file mode 100644 index 0000000000..0470d5243e --- /dev/null +++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <set> +#include <string> + +using namespace arm_compute; + +void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + // The kernel loops over all elements in steps of 16 + const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16); + + // Set build options + std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) }; + + // Tell the kernel that the width is not a multiple of 16 + if((input->info()->dimension(0) % max_cl_vector_width) != 0) + { + build_opts.emplace("-DNON_MULTIPLE_OF_16"); + } + + // Create kernel + _kernel = 
static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts)); + + // Set fixed arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0)); + + // Configure kernel window + constexpr unsigned int num_elems_written_per_iteration = 1; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel() + : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr) +{ +} + +void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum); + + _input = input; + _max = max; + _output = output; + _sum = sum; + + // The kernel loops over all elements in steps of 16 + const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16); + + // Set build options + std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) }; + + // Tell the kernel that the width is not a multiple of 16 + 
if((input->info()->dimension(0) % max_cl_vector_width) != 0) + { + build_opts.emplace("-DNON_MULTIPLE_OF_16"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts)); + + // Set fixed arguments + unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal max_access(max->info(), 0, 1); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal sum_access(sum->info(), 0, 1); + + update_window_and_padding(win, input_access, max_access, output_access, sum_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + // Set inputs + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _max, slice); + add_2D_tensor_argument(idx, _output, slice); + add_2D_tensor_argument(idx, _sum, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} + +CLLogits1DNormKernel::CLLogits1DNormKernel() + : _input(nullptr), _sum(nullptr), _output(nullptr) +{ +} + +void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, 
DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum); + + _input = input; + _sum = sum; + _output = output; + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type()))); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, sum_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_2D(); + + do + { + Window sum_slice = slice; + sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); + + unsigned int idx = 0; + // Set inputs + add_2D_tensor_argument(idx, _input, slice); + add_2D_tensor_argument(idx, _sum, sum_slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp new file mode 100644 index 
0000000000..bbdaa37410 --- /dev/null +++ b/src/core/CL/kernels/CLTableLookupKernel.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLLut.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include <cstdint> +#include <string> + +using namespace arm_compute; + +void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(lut == nullptr); + ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + // Create kernel + std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set lut argument + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, lut->cl_buffer()); + if(DataType::S16 == lut->type()) + { + _kernel.setArg(idx++, lut->index_offset()); + _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements())); + } + + // Configure kernel + constexpr unsigned int num_elems_processed_per_iteration = 8; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp new file mode 100644 index 0000000000..6e07cefc77 --- /dev/null +++ b/src/core/CL/kernels/CLThresholdKernel.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <string> + +using namespace arm_compute; + +void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, + uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Construct kernel name + std::string kernel_name = "threshold"; + + switch(type) + { + case ThresholdType::BINARY: + kernel_name += "_binary"; + break; + case ThresholdType::RANGE: + kernel_name += "_range"; + break; + default: + ARM_COMPUTE_ERROR("Thresholding type not recognized"); + break; + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + + // Set arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg(idx++, false_value); + _kernel.setArg(idx++, true_value); + _kernel.setArg(idx++, threshold); + + if(ThresholdType::RANGE == type) + { + _kernel.setArg(idx++, upper); + } + + // Make sure _kernel is initialized before calling the parent's configure + constexpr unsigned int num_elems_processed_per_iteration = 16; + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); +} diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp new file mode 100644 index 0000000000..2ee6fcb9dc --- /dev/null +++ b/src/core/CL/kernels/CLTransposeKernel.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" + +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t w_out = input->info()->dimension(1); + const size_t h_out = input->info()->dimension(0); + output_shape.set(0, w_out); + output_shape.set(1, h_out); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + _lws_hint = cl::NDRange(2, 8); + + std::set<std::string> build_opts; + std::ostringstream data_type_in_bytes; + data_type_in_bytes << input->info()->element_size(); + build_opts.emplace("-DDATA_TYPE_IN_BYTES=" + data_type_in_bytes.str()); + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->info()->element_size(); + + Window win = calculate_max_window(*input->info(), 
Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp new file mode 100644 index 0000000000..e549dbc258 --- /dev/null +++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <cstddef> +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +namespace +{ +void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size) +{ + for(size_t i = 0; i < size; ++i) + { + std::stringstream mat_str; + mat_str << "-DMAT" << i << "=" << matrix[i] << " "; + options.insert(mat_str.str()); + } +} +} // namespace + +BorderSize CLWarpAffineKernel::border_size() const +{ + return BorderSize(1); +} + +void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy); + + _input = input; + _output = output; + + // Create build options + std::set<std::string> options; + options_add_matrix(options, matrix, 6); + options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + // Create kernel + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string kernel_name = "warp_affine_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options)); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and 
output parameters + _kernel.setArg<cl_int>(idx++, input->info()->dimension(0)); + _kernel.setArg<cl_int>(idx++, input->info()->dimension(1)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1)); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp new file mode 100644 index 0000000000..fddb580750 --- /dev/null +++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <cstddef> +#include <set> +#include <sstream> +#include <string> + +using namespace arm_compute; + +namespace +{ /* Adds one build option per matrix coefficient: for every i in [0, size) the string -DMAT<i>=<matrix[i]> followed by a space is inserted into options. NOTE(review): the value is streamed with the stringstream's default float formatting, so presumably only about 6 significant digits reach the build option - confirm this is intended. */ +inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size) +{ + for(size_t i = 0; i < size; ++i) + { + std::stringstream mat_str; + mat_str << "-DMAT" << i << "=" << matrix[i] << " "; + options.insert(mat_str.str()); + } +} +} // namespace +/* Returns BorderSize(1) as the border of this kernel. */ +BorderSize CLWarpPerspectiveKernel::border_size() const +{ + return BorderSize(1); +} +/* Configures the warp-perspective kernel. Runs the ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(..., 1, DataType::U8) checks on input and output (presumably: single-channel U8 only - confirm against the macro), raises an error for InterpolationPolicy::AREA, and stores both tensor pointers. It then creates the OpenCL kernel named warp_perspective_<lower-cased policy name>, built with the 9 matrix coefficients (-DMAT0 .. -DMAT8) and -DDATA_TYPE as options; matrix must therefore point to at least 9 floats. */ +void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy); + + _input = input; + _output = output; + + // Create build options + std::set<std::string> options; + options_add_matrix(options, matrix, 9); + options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + + // Create kernel + std::string interpolation_name = string_from_interpolation_policy(policy); + std::transform(interpolation_name.begin(), interpolation_name.end(), 
interpolation_name.begin(), ::tolower); + std::string kernel_name = "warp_perspective_" + interpolation_name; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options)); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + _kernel.setArg<cl_int>(idx++, input->info()->dimension(0)); + _kernel.setArg<cl_int>(idx++, input->info()->dimension(1)); +/* The execution window is computed over the output tensor in steps of 4 elements (num_elems_processed_per_iteration); after the window and padding update, the whole output tensor shape is set as the valid region. */ + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1)); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp new file mode 100644 index 0000000000..018f272921 --- /dev/null +++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; +/* Stores is_shared and initialises the input, biases and output tensor pointers to nullptr. */ +CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared) + : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr) +{ +} +/* Configures the weights-reshape kernel. input and output (and biases, when non-null) go through the F16/F32 data-type check. With _is_shared set, input dimension 4 must equal output dimension 2, input may have at most 5 dimensions and output at most 3; otherwise input may have at most 4 dimensions and output at most 2. The three tensor pointers are stored, the reshape_to_columns kernel is built with -DDATA_TYPE (plus -DHAS_BIAS when biases is non-null), input dimensions 0 to 3 are set as cl_uint arguments placed after the tensor arguments, and the execution window is the maximum window over the input. */ +void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + if(_is_shared) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2))); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5); + ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3); + } + else + { + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2); + } + + // Check biases + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32); + } + + _biases = biases; + _output = output; + _input = input; + + // Create build options + std::set<std::string> 
build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : "")); /* NOTE(review): when biases is null this inserts an empty string into build_opts. */ + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts)); + + // Set static arguments + unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); + idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0; /* also skip the 1D biases tensor arguments when biases are given */ + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0)); + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1)); + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2)); + _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps()); + // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + ICLKernel::configure(win); +} +/* Constructs the base CLWeightsReshapeKernel with is_shared = false. */ +CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel() + : CLWeightsReshapeKernel(false) +{ +} +/* Enqueues the kernel a single time. After the unconfigured-kernel and mismatching-window checks, it binds the first 3D slice of window for the input, the first 2D slice of a window built from the output tensor, and, when biases are set, a 1D slice whose X dimension is Window::Dimension(0, x size of the biases shape, 1); the kernel is then enqueued on queue over the input slice. */ +void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()); + + Window in_slice = window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Set arguments + unsigned idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + if(_biases != nullptr) + { + Window biases_slice; + biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1)); + add_1D_tensor_argument(idx, _biases, biases_slice); + } + + // Run kernel + enqueue(queue, *this, in_slice); +} + 
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel() + : CLWeightsReshapeKernel(true) /* base kernel is constructed with is_shared = true */ +{ +} +/* Enqueues the kernel once per slice. After the unconfigured-kernel and mismatching-window checks, it takes the first 3D slice of window for the input, the first 2D slice of a window built from the output tensor and, when biases are set, the first 1D slice of a window built from the biases tensor. Each do/while iteration binds the current input, output and (optional) biases slices as kernel arguments, advances the biases slice right after binding it, and enqueues the kernel over the input slice. The loop repeats while both window.slide_window_slice_4D(in_slice) and out_window.slide_window_slice_2D(out_slice) return true. */ +void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()); + + Window in_slice = window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_2D(); + + Window biases_window; + Window biases_slice; + + if(_biases != nullptr) + { + biases_window.use_tensor_dimensions(_biases->info()); + biases_slice = biases_window.first_slice_window_1D(); + } + + do + { + // Set arguments + unsigned idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + if(_biases != nullptr) + { + add_1D_tensor_argument(idx, _biases, biases_slice); + biases_window.slide_window_slice_1D(biases_slice); + } + + // Run kernel + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); +} |