diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/core/CL | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). clang-format version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript, ...)
And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/core/CL')
109 files changed, 3278 insertions, 2239 deletions
diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp index 7fcfdf2c89..d094dcdaea 100644 --- a/src/core/CL/CLCommandBuffer.cpp +++ b/src/core/CL/CLCommandBuffer.cpp @@ -38,7 +38,7 @@ std::unique_ptr<CLCommandBuffer> CLCommandBuffer::create(cl_command_queue queue) const auto &cl_device = CLKernelLibrary::get().get_device(); const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); - if(has_mutable_dispatch) + if (has_mutable_dispatch) { return std::make_unique<CLMutableCommandBuffer>(queue); } diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h index 8a94e389fa..90e434161e 100644 --- a/src/core/CL/CLCommandBuffer.h +++ b/src/core/CL/CLCommandBuffer.h @@ -87,7 +87,8 @@ public: * @param[in] global The global work size. * @param[in] local The local work size. */ - virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; + virtual void + add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; /** Add the mutable argument to the current kernel enqueue command. 
* @@ -154,7 +155,7 @@ protected: CLCommandBuffer &state(State state); private: - State _state{ State::Created }; + State _state{State::Created}; }; } // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp index f1a902c7b9..242fd7719c 100644 --- a/src/core/CL/CLCompatCommandBuffer.cpp +++ b/src/core/CL/CLCompatCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) - : _queue(queue) +CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) { } @@ -40,11 +39,14 @@ CLCompatCommandBuffer::~CLCompatCommandBuffer() { } -void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); - _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} }); + _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); } void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) @@ -52,7 +54,7 @@ void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const ARM_COMPUTE_ERROR_ON(state() != State::Created); ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); - _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value }); + _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); } void CLCompatCommandBuffer::finalize() @@ -61,7 +63,7 @@ void CLCompatCommandBuffer::finalize() _kernel_cmds.shrink_to_fit(); - for(auto &cmd : _kernel_cmds) + for (auto &cmd : _kernel_cmds) { cmd.mutable_args.shrink_to_fit(); } @@ -80,25 +82,19 @@ void CLCompatCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != 
State::Finalized); - for(const auto &cmd : _kernel_cmds) + for (const auto &cmd : _kernel_cmds) { - for(const auto &arg : cmd.mutable_args) + for (const auto &arg : cmd.mutable_args) { const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); handle_cl_error("clSetKernelArg", error); } - const auto error = clEnqueueNDRangeKernel( - _queue, - cmd.kernel, - static_cast<cl_uint>(cmd.global.dimensions()), - cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, - cmd.global.get(), - cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, - 0, - nullptr, - nullptr); + const auto error = + clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()), + cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), + cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr); handle_cl_error("clEnqueueNDRangeKernel", error); } diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h index e91d52d2d6..d5df106425 100644 --- a/src/core/CL/CLCompatCommandBuffer.h +++ b/src/core/CL/CLCompatCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. */ CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 2d024f9c2f..9bbc32657e 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -22,19 +22,19 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" + #include "support/StringSupport.h" #include <regex> namespace arm_compute { -CLBuildOptions::CLBuildOptions() - : _build_opts() +CLBuildOptions::CLBuildOptions() : _build_opts() { } @@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option) void CLBuildOptions::add_option_if(bool cond, std::string option) { - if(cond) + if (cond) { add_option(std::move(option)); } @@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options) void CLBuildOptions::add_options_if(bool cond, const StringSet &options) { - if(cond) + if (cond) { add_options(options); } @@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const return _build_opts == other._build_opts; } -Program::Program() - : _context(), _device(), _is_binary(false), _name(), _source(), _binary() +Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary() { } Program::Program(cl::Context context, std::string name, std::string source) - : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary() + : _context(std::move(context)), + _device(), + _is_binary(false), + _name(std::move(name)), + _source(std::move(source)), + _binary() { } Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary) - : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary)) + : _context(std::move(context)), + _device(std::move(device)), + _is_binary(true), + _name(std::move(name)), + _source(), + _binary(std::move(binary)) { } Program::operator cl::Program() const { - if(_is_binary) + if (_is_binary) { - return 
cl::Program(_context, { _device }, { _binary }); + return cl::Program(_context, {_device}, {_binary}); } else { @@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options { return program.build(build_options.c_str()) == CL_SUCCESS; } - catch(const cl::Error &e) + catch (const cl::Error &e) { cl_int err = CL_SUCCESS; const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err); - for(auto &pair : build_info) + for (auto &pair : build_info) { std::cerr << pair.second << std::endl; } @@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const return cl_program; } -Kernel::Kernel() - : _name(), _kernel() +Kernel::Kernel() : _name(), _kernel() { } Kernel::Kernel(std::string name, const cl::Program &program) - : _name(std::move(name)), - _kernel(cl::Kernel(program, _name.c_str())) + : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str())) { } CLCompileContext::CLCompileContext() @@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device _is_wbsm_supported = get_wbsm_support_info(device); } -Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, - const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const +Kernel CLCompileContext::create_kernel(const std::string &kernel_name, + const std::string &program_name, + const std::string &program_source, + const std::string &kernel_path, + const StringSet &build_options_set, + bool is_binary) const { const std::string build_options = generate_build_options(build_options_set, kernel_path); const std::string built_program_name = program_name + "_" + build_options; auto built_program_it = _built_programs_map.find(built_program_name); cl::Program cl_program; - if(_built_programs_map.end() != built_program_it) + if (_built_programs_map.end() != built_program_it) { // If program has 
been built, retrieve to create kernel from it cl_program = built_program_it->second; @@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std return Kernel(kernel_name, cl_program); } -const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const +const Program & +CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const { const auto program_it = _programs_map.find(program_name); - if(program_it != _programs_map.end()) + if (program_it != _programs_map.end()) { return program_it->second; } @@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c ARM_COMPUTE_UNUSED(is_binary); program = Program(_context, program_name, program_source); #else /* EMBEDDED_KERNELS */ - if(is_binary) + if (is_binary) { - program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end())); + program = Program(_context, _device.cl_device(), program_name, + std::vector<unsigned char>(program_source.begin(), program_source.end())); } else { @@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c void CLCompileContext::set_context(cl::Context context) { _context = std::move(context); - if(_context.get() != nullptr) + if (_context.get() != nullptr) { const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); - if(!cl_devices.empty()) + if (!cl_devices.empty()) { _device = CLDevice(cl_devices[0]); } } } -std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const +std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, + const std::string &kernel_path) const { std::string concat_str; bool ext_supported = false; @@ -241,27 +255,27 @@ std::string 
CLCompileContext::generate_build_options(const StringSet &build_opti #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) GPUTarget gpu_arch = get_arch_from_target(_device.target()); - concat_str += " -DGPU_ARCH=" + support::cpp11::to_string( - static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch)); + concat_str += + " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch)); - if(_device.supported("cl_khr_fp16")) + if (_device.supported("cl_khr_fp16")) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) + if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_accumulate_int8")) + if (_device.supported("cl_arm_integer_dot_product_accumulate_int8")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 "; } std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported(); - if(ext_supported) + if (ext_supported) { concat_str += ext_buildopts; } @@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); } - if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) + if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) { concat_str += " -DUNROLL_WITH_PRAGMA "; } @@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin #endif /* EMBEDDED_KERNELS */ // Concatenate set - for(const auto &el : s) + for (const auto &el : s) { concat_set += " " + el; } @@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const GPUTarget _target = get_target_from_device(_device.cl_device()); 
cl::NDRange default_range; - switch(_target) + switch (_target) { case GPUTarget::MIDGARD: case GPUTarget::T600: @@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons size_t result; size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; @@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const const std::regex ddk_regex("r([0-9]*)p[0-9]"); std::smatch ddk_match; - if(std::regex_search(device_version, ddk_match, ddk_regex)) + if (std::regex_search(device_version, ddk_match, ddk_regex)) { return std::stoi(ddk_match[1]); } diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 78f36100d5..5ea99d360a 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/gpu/cl/ClCompileContext.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClKernelLibrary.h" #include <utility> @@ -39,7 +40,7 @@ namespace arm_compute { std::string get_cl_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt) std::string get_cl_promoted_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt) std::string get_cl_unsigned_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "uchar"; @@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size) std::string get_cl_signed_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "char"; @@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size) std::string get_cl_select_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) std::string get_data_size_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case 
DataType::U8: case DataType::S8: @@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device) const GPUTarget gpu_target = get_target_from_name(device_name); // SW_WORKAROUND: Workaround for DDK revision r14p0.to enable cl_arm_integer_dot_product_int8 - std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 }; - return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0); + std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76}; + return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || + sw_workaround_issue.count(gpu_target) != 0); } bool dot8_acc_supported(const cl::Device &device) @@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device) CLVersion get_cl_version(const cl::Device &device) { std::string version_str = device.getInfo<CL_DEVICE_VERSION>(); - if(version_str.find("OpenCL 3") != std::string::npos) + if (version_str.find("OpenCL 3") != std::string::npos) { return CLVersion::CL30; } - else if(version_str.find("OpenCL 2") != std::string::npos) + else if (version_str.find("OpenCL 2") != std::string::npos) { return CLVersion::CL20; } - else if(version_str.find("OpenCL 1.2") != std::string::npos) + else if (version_str.find("OpenCL 1.2") != std::string::npos) { return CLVersion::CL12; } - else if(version_str.find("OpenCL 1.1") != std::string::npos) + else if (version_str.find("OpenCL 1.1") != std::string::npos) { return CLVersion::CL11; } - else if(version_str.find("OpenCL 1.0") != std::string::npos) + else if (version_str.find("OpenCL 1.0") != std::string::npos) { return CLVersion::CL10; } @@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n return (pos != std::string::npos); } -bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout) +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, + const Size2D 
&kernel_size, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - std::vector<WinogradConfiguration> winograd_configs_nchw = - { + std::vector<WinogradConfiguration> winograd_configs_nchw = { WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)), @@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)), WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)), - WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)) - }; + WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))}; - std::vector<WinogradConfiguration> winograd_configs_nhwc = - { + std::vector<WinogradConfiguration> winograd_configs_nhwc = { WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)), WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)), @@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si std::pair<int, int>(kernel_size.width, kernel_size.height)); // Return true if supported - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end()); + return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != + winograd_configs_nchw.end()); } else { - return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != 
winograd_configs_nhwc.end()); + return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != + winograd_configs_nhwc.end()); } } size_t preferred_vector_width(const cl::Device &device, const DataType dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device) cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr); - if(err == CL_SUCCESS) + if (err == CL_SUCCESS) { return pixel_aligment; } @@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device) { cl_bool supported = CL_FALSE; - cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); + cl_int err = + clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); return (err == CL_SUCCESS && supported == CL_TRUE); } -cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts) +cl::Kernel +create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts) { opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); @@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ auto kernel_src = klib.program(program_name); const std::string kernel_path = klib.kernel_path(); - return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary)); + return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, + build_opts, kernel_src.is_binary)); } cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size) @@ -423,8 +429,9 @@ cl::NDRange 
create_lws_hint_parallel_implementations(unsigned int input_dimensio bool get_wbsm_support_info(const cl::Device &device) { cl_bitfield capabilities = 0; - cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); - if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), + &capabilities, nullptr); + if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) { return true; } @@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device) void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) { - cl_int err = clSetKernelExecInfo(kernel.get(), - CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, - sizeof(cl_int), - &wbsm_hint); + cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, + sizeof(cl_int), &wbsm_hint); ARM_COMPUTE_UNUSED(err); ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); } bool export_to_cl_image(const ITensorInfo *tensor) { - if(tensor->tensor_shape()[0] % 4 != 0) + if (tensor->tensor_shape()[0] % 4 != 0) { return false; } // If not floating point - if(!is_data_type_float(tensor->data_type())) + if (!is_data_type_float(tensor->data_type())) { return false; } // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) { return false; } // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) { return false; } @@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor) const size_t max_image_w = 
CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - if(image_w > max_image_w || image_h > max_image_h) + if (image_w > max_image_w || image_h > max_image_h) { return false; } @@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor) void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values) { - for(const int value : values) + for (const int value : values) { - if(value > max_manual_loop_unrolling) + if (value > max_manual_loop_unrolling) { built_opts.add_option("-DUNROLL_WITH_PRAGMA"); return; diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index c5a0796c3a..e69d006750 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -22,8 +22,11 @@ * SOFTWARE. */ #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "arm_compute/core/Error.h" + #include "src/gpu/cl/ClKernelLibrary.h" + #include <algorithm> #include <array> #include <fstream> @@ -31,8 +34,7 @@ #include <vector> namespace arm_compute { -CLKernelLibrary::CLKernelLibrary() - : _compile_context() +CLKernelLibrary::CLKernelLibrary() : _compile_context() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built } @@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get() static CLKernelLibrary _kernel_library; return _kernel_library; } -Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const +Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, + const std::set<std::string> &build_options_set) const { const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); const std::string program_name = klib.program_name(kernel_name); auto program = klib.program(program_name); const std::string &kernel_path = 
CLKernelLibrary::get().get_kernel_path(); - return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary); + return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, + program.is_binary); } std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const { @@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context() { return _compile_context; } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp index b9c59ac6f0..05b351fc25 100644 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) - : CLCommandBuffer() +CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() { cl_int status = CL_SUCCESS; @@ -52,7 +51,10 @@ CLMutableCommandBuffer::~CLMutableCommandBuffer() handle_cl_error("clReleaseCommandBufferKHR", status); } -void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); @@ -65,18 +67,8 @@ void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &off }; const auto error = clCommandNDRangeKernelKHR( - _cb, - nullptr, - properties, - kernel, - global.dimensions(), - offset.dimensions() != 0 ? offset.get() : nullptr, - global.get(), - local.dimensions() != 0 ? local.get() : nullptr, - 0, - nullptr, - nullptr, - &mutable_handle); + _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, + global.get(), local.dimensions() != 0 ? 
local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); handle_cl_error("clCommandNDRangeKernelKHR", error); @@ -114,7 +106,7 @@ void CLMutableCommandBuffer::finalize() size_t arg_no = 0; - for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs) + for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) { ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; @@ -132,9 +124,7 @@ void CLMutableCommandBuffer::update() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clUpdateMutableCommandsKHR( - _cb, - &_mut_cfg); + const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); handle_cl_error("clUpdateMutableCommandsKHR", error); } @@ -143,13 +133,7 @@ void CLMutableCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clEnqueueCommandBufferKHR( - 0, - nullptr, - _cb, - 0, - nullptr, - nullptr); + const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); handle_cl_error("clEnqueueCommandBufferKHR", error); } diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h index 04e94b0bb2..8997d7d1fd 100644 --- a/src/core/CL/CLMutableCommandBuffer.h +++ b/src/core/CL/CLMutableCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. 
*/ CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 289300b3a1..290ed32648 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "support/StringSupport.h" namespace arm_compute @@ -38,15 +39,15 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); const cl::Context &ctx = CLKernelLibrary::get().context(); - const cl::Buffer &buffer = tensor->cl_buffer(); + const cl::Buffer &buffer = tensor->cl_buffer(); const ITensorInfo *info = tensor->info(); - ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), - "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement"); + ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to " + "satisfy cl_image pitch alignment requirement"); - const size_t image_w{ info->dimension(0) / 4 }; - const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) }; - const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>() }; - const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() }; + const size_t image_w{info->dimension(0) / 4}; + 
const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)}; + const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()}; + const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()}; ARM_COMPUTE_UNUSED(max_image_w, max_image_h); ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image"); @@ -58,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type); } -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type) +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type) { ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, - "Cannot create cl_image from empty cl_buffer"); + ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer"); cl_channel_type cl_data_type; - switch(data_type) + switch (data_type) { case DataType::F32: cl_data_type = CL_FLOAT; @@ -84,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer cl_mem cl_image; cl_int err = CL_SUCCESS; - const cl_image_format format = { CL_RGBA, cl_data_type }; + const cl_image_format format = {CL_RGBA, cl_data_type}; cl_image_desc desc; memset(&desc, 0, 
sizeof(desc)); @@ -94,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer desc.image_width = shape2d[0]; desc.image_height = shape2d[1]; - switch(image_type) + switch (image_type) { case CLImage2DType::ReadOnly: cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); @@ -114,7 +119,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer void handle_cl_error(const std::string &function_name, cl_int error_code) { - if(error_code != CL_SUCCESS) + if (error_code != CL_SUCCESS) { std::string error_message = function_name + " - Error code: " + std::to_string(error_code); ARM_COMPUTE_ERROR(error_message.c_str()); diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index de9c1b3194..f9dcfeac3a 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -72,7 +72,12 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type); /** Check for CL error code and throw exception accordingly. 
* diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h index 7b5294e452..50d224f1c0 100644 --- a/src/core/CL/CLValidate.h +++ b/src/core/CL/CLValidate.h @@ -29,11 +29,13 @@ namespace arm_compute { -#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) -#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) /** Return an error if int64_base_atomics extension is not supported by the device. 
* @@ -43,11 +45,13 @@ namespace arm_compute * * @return Status */ -inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) +inline arm_compute::Status +error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) { - if(!CLKernelLibrary::get().int64_base_atomics_supported()) + if (!CLKernelLibrary::get().int64_base_atomics_supported()) { - return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported"); + return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, + "Atomic functions are not supported"); } return arm_compute::Status{}; } diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index a53fdbbab6..f96b24d2a9 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_y); - if(gws_z != 1) + if (gws_z != 1) { return cl::NDRange(4, 4, 2); } else { - if(gws_x > 256) + if (gws_x > 256) { return cl::NDRange(2, 16, 1); } @@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < gws_y) + if (gws_x < gws_y) { - if(gws_x < 4) + if (gws_x < 4) { return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1); } @@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z) ARM_COMPUTE_UNUSED(gws_y); ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < 32) + if (gws_x < 32) { return cl::NDRange(gws_x, 4, 4); } @@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) const size_t gws_y = gws[1]; const size_t gws_z = gws[2]; - switch(kernel_type) + switch (kernel_type) { case CLKernelType::GEMM: { @@ -124,4 +124,4 @@ cl::NDRange 
get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) } } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index dc3a86a528..ac53e7f1d2 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -25,18 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/Utils.h" #include <cstddef> -void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) +void arm_compute::enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint, + bool use_dummy_work_items) { - if(kernel.kernel()() == nullptr) + if (kernel.kernel()() == nullptr) { return; } - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(window[i].step() == 0); // Make sure that dimensions > Z are 1 @@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange - if(gws.dimensions() == 0) + if (gws.dimensions() == 0) { return; } @@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind kernel.cache_gws(gws); cl::NDRange valid_lws; - if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) { valid_lws = cl::NullRange; } @@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange lws = cl::NullRange; - if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) + if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { lws = valid_lws; } - if(CLKernelLibrary::get().is_wbsm_supported()) + if 
(CLKernelLibrary::get().is_wbsm_supported()) { set_wbsm(kernel.kernel(), kernel.wbsm_hint()); } @@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons // Calculate offset to the start of the window unsigned int offset_first_element = info->offset_first_element_in_bytes(); - for(unsigned int n = 0; n < info->num_dimensions(); ++n) + for (unsigned int n = 0; n < info->num_dimensions(); ++n) { offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n]; } @@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int d = 0; d < dimension_size; ++d) + for (unsigned int d = 0; d < dimension_size; ++d) { _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]); _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step())); @@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx, - "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>()); + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_tensor<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } @@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device) size_t ICLKernel::get_max_workgroup_size() { - if(_max_workgroup_size == 0) + if (_max_workgroup_size == 0) { _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); } @@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size() cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { - if((window.x().end() - 
window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) + if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { return cl::NullRange; } @@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); - if(use_dummy_work_items) + if (use_dummy_work_items) { gws.get()[0] = get_next_power_two(gws[0]); gws.get()[1] = get_next_power_two(gws[1]); diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index c82809cef3..6aebef15a5 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTuningParams.h" #include "src/core/CL/DefaultLWSHeuristics.h" @@ -43,14 +43,14 @@ namespace { bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1) { - if(lws0.dimensions() != lws1.dimensions()) + if (lws0.dimensions() != lws1.dimensions()) { return false; } - for(size_t i = 0; i < lws0.dimensions(); ++i) + for (size_t i = 0; i < lws0.dimensions(); ++i) { - if(lws0.get()[i] != lws1.get()[i]) + if (lws0.get()[i] != lws1.get()[i]) { return false; } @@ -71,7 +71,7 @@ private: * * @return The number of arguments enqueued per array object. */ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_array() { return num_arguments_per_tensor<dimension_size>(); @@ -80,7 +80,7 @@ private: * * @return The number of arguments enqueued per tensor object. 
*/ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_tensor() { return 2 + 2 * dimension_size; @@ -116,11 +116,13 @@ protected: * @param[in] window The maximum window which will be returned by window() * @param[in] tuning_params_hint (Optional) Tuning parameters to use. */ - void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + void configure_internal(const Window &window, + CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), + 0)) { _tuning_params_hint = tuning_params_hint; - if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) + if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) { // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which // will be recalculated with use_dummy_work_items flag at run time again anyway. @@ -133,7 +135,13 @@ protected: public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange) + : _kernel(nullptr), + _target(GPUTarget::MIDGARD), + _config_id(arm_compute::default_config_id), + _max_workgroup_size(0), + _type(CLKernelType::UNKNOWN), + _tuning_params_hint(), + _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -161,7 +169,11 @@ public: * @param[in] window Window the kernel will be executed on. 
*/ template <typename T> - void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + void add_1D_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window) { add_array_argument<T, 1>(idx, array, strides, num_dimensions, window); } @@ -184,7 +196,7 @@ public: */ void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_1D_tensor_argument(idx, tensor, window); } @@ -208,7 +220,7 @@ public: */ void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_2D_tensor_argument(idx, tensor, window); } @@ -469,7 +481,11 @@ private: * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> - void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window); /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. @@ -505,7 +521,11 @@ private: * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. 
*/ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); +void enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), + bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * @@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> -void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) +void ICLKernel::add_array_argument( + unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { ARM_COMPUTE_ERROR_ON(array == nullptr); // Calculate offset to the start of the window unsigned int offset_first_element = 0; - for(unsigned int n = 0; n < num_dimensions; ++n) + for (unsigned int n = 0; n < num_dimensions; ++n) { offset_first_element += window[n].start() * strides[n]; } @@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons unsigned int idx_start = idx; _kernel.setArg(idx++, array->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for (unsigned int dimension = 0; dimension < dimension_size; dimension++) { _kernel.setArg<cl_uint>(idx++, strides[dimension]); _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step()); @@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + 
num_arguments_per_array<dimension_size>() != idx, - "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>()); + "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_array<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLKERNEL_H */ diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp index 5d8295bdfe..3f7edbb88d 100644 --- a/src/core/CL/ICLSimple2DKernel.cpp +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, slice); add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h index 5246492401..97bc1e58c2 100644 --- a/src/core/CL/ICLSimple2DKernel.h +++ b/src/core/CL/ICLSimple2DKernel.h @@ -37,5 +37,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */ diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp index fef1a86125..71d7d1f07b 100644 --- a/src/core/CL/ICLSimple3DKernel.cpp +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h index ff0b274663..5071b6b339 100644 
--- a/src/core/CL/ICLSimple3DKernel.h +++ b/src/core/CL/ICLSimple3DKernel.h @@ -39,5 +39,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */ diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp index d67fefdf71..c31db8355f 100644 --- a/src/core/CL/ICLSimpleKernel.cpp +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -22,30 +22,35 @@ * SOFTWARE. */ #include "src/core/CL/ICLSimpleKernel.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; -ICLSimpleKernel::ICLSimpleKernel() - : _input(nullptr), _output(nullptr) +ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr) { } -void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +void ICLSimpleKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined, + const BorderSize &border_size) { _input = input; _output = output; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + Window win = + calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); output_access.set_valid_region(win, 
input->info()->valid_region(), border_undefined, border_size); diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h index b35547a217..6afd7309aa 100644 --- a/src/core/CL/ICLSimpleKernel.h +++ b/src/core/CL/ICLSimpleKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -55,12 +56,16 @@ public: * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. * @param[in] border_size (Optional) Size of the border. */ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined = false, + const BorderSize &border_size = BorderSize()); protected: const ICLTensor *_input; ICLTensor *_output; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */ diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp index b541bff04a..0771db7f50 100644 --- a/src/core/CL/ICLTensor.cpp +++ b/src/core/CL/ICLTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -ICLTensor::ICLTensor() - : _mapping(nullptr) +ICLTensor::ICLTensor() : _mapping(nullptr) { } diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index b092dfb4e2..35421d025e 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -36,11 +36,7 @@ namespace arm_compute { -CLSymbols::CLSymbols() noexcept(false) - : _loaded( -{ - false, false -}) +CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false}) { } @@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get() bool CLSymbols::load_default() { - static const std::vector<std::string> libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" }; + static const 
std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; - if(_loaded.first) + if (_loaded.first) { return _loaded.second; } @@ -62,34 +58,32 @@ bool CLSymbols::load_default() // Indicate that default loading has been tried _loaded.first = true; - if(load(libraries_filenames, /* use_loader */ false)) + if (load(libraries_filenames, /* use_loader */ false)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from shared library"); return true; } #ifdef __ANDROID__ // When running in NDK environment, the above libraries are not accessible. - static const std::vector<std::string> android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" }; + static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"}; - if(load(android_libraries_filenames, /* use_loader */ true)) + if (load(android_libraries_filenames, /* use_loader */ true)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from android shared library"); return true; } #endif // __ANDROID__ // If not returned till here then libraries not found std::stringstream ss; - std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(libraries_filenames.begin(), libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #ifdef __ANDROID__ - std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), + [&ss](const std::string 
&s) { ss << s << " "; }); #endif // __ANDROID__ std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl; return false; @@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u { void *handle = nullptr; unsigned int index = 0; - for(index = 0; index < libraries_filenames.size(); ++index) + for (index = 0; index < libraries_filenames.size(); ++index) { handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL); - if(handle != nullptr) + if (handle != nullptr) { break; } } - if(index == libraries_filenames.size()) + if (index == libraries_filenames.size()) { // Set status of loading to failed _loaded.second = false; @@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u } #ifdef __ANDROID__ - typedef void* (*loadOpenCLPointer_t)(const char* name); + typedef void *(*loadOpenCLPointer_t)(const char *name); loadOpenCLPointer_t loadOpenCLPointer; - if (use_loader) { + if (use_loader) + { typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL")); + enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL")); enableOpenCL(); - loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>( - dlsym(handle, "loadOpenCLPointer")); - } else { + loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer")); + } + else + { loadOpenCLPointer = nullptr; } -#define LOAD_FUNCTION_PTR(func_name, _handle) \ - func_name##_ptr = reinterpret_cast<decltype(func_name) *>( use_loader ? \ - loadOpenCLPointer(#func_name) : dlsym(handle, #func_name)); +#define LOAD_FUNCTION_PTR(func_name, _handle) \ + func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? 
loadOpenCLPointer(#func_name) \ + : dlsym(handle, #func_name)); #else /* __ANDROID__ */ (void)use_loader; // Avoid unused warning #define LOAD_FUNCTION_PTR(func_name, handle) \ @@ -234,12 +229,11 @@ bool opencl_is_available() } } // namespace arm_compute -cl_int clEnqueueMarker(cl_command_queue command_queue, - cl_event *event) +cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, event); } @@ -249,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue, } } -cl_int clWaitForEvents(cl_uint num_events, - const cl_event *event_list) +cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_events, event_list); } @@ -264,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events, } } -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr, - size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void *svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); } @@ -279,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_ } } -cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint 
num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -298,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, alignment); } @@ -312,7 +314,7 @@ void clSVMFree(cl_context context, void *svm_pointer) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMFree_ptr; - if(func != nullptr) + if (func != nullptr) { func(context, svm_pointer); } @@ -326,7 +328,7 @@ cl_int clGetContextInfo(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, param_name, param_value_size, param_value, param_value_size_ret); } @@ -343,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -360,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -370,17 +372,16 @@ 
cl_command_queue clCreateCommandQueueWithProperties(cl_context c } } -cl_context clCreateContext( - const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void (*pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret) +cl_context clCreateContext(const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id *devices, + void (*pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); } @@ -398,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, device_type, pfn_notify, user_data, errcode_ret); } @@ -408,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, } } -cl_int clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) +cl_int clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, num_devices, device_list, options, pfn_notify, user_data); } @@ -428,22 +428,22 @@ cl_int clBuildProgram( } } -cl_int clEnqueueNDRangeKernel( - cl_command_queue command_queue, 
- cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, + num_events_in_wait_list, event_wait_list, event); } else { @@ -451,15 +451,11 @@ cl_int clEnqueueNDRangeKernel( } } -cl_int clSetKernelArg( - cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void *arg_value) +cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, arg_index, arg_size, arg_value); } @@ -473,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -487,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -497,17 +493,16 @@ 
cl_int clReleaseMemObject(cl_mem memobj) } } -cl_int clEnqueueUnmapMemObject( - cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -521,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -535,7 +530,7 @@ cl_int clReleaseContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -548,7 +543,7 @@ cl_int clReleaseEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -558,22 +553,22 @@ cl_int clReleaseEvent(cl_event event) } } -cl_int clEnqueueWriteBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event 
*event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -581,22 +576,22 @@ cl_int clEnqueueWriteBuffer( } } -cl_int clEnqueueReadBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -604,17 +599,16 @@ cl_int clEnqueueReadBuffer( } } -cl_int clGetProgramBuildInfo( - cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, device, 
param_name, param_value_size, param_value, param_value_size_ret); } @@ -628,7 +622,7 @@ cl_int clRetainProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -638,27 +632,27 @@ cl_int clRetainProgram(cl_program program) } } -void *clEnqueueMapBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) +void *clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, + event_wait_list, event, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -670,7 +664,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -680,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) } } -cl_program clCreateProgramWithBinary( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char 
**binaries, - cl_int *binary_status, - cl_int *errcode_ret) +cl_program clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char **binaries, + cl_int *binary_status, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -709,7 +702,7 @@ cl_int clRetainContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -723,7 +716,7 @@ cl_int clReleaseProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -737,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFlush_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -751,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFinish_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -761,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue) } } -cl_int clGetProgramInfo( - cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void *param_value, + 
size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, param_name, param_value_size, param_value, param_value_size_ret); } @@ -780,20 +772,17 @@ cl_int clGetProgramInfo( } } -cl_kernel clCreateKernel( - cl_program program, - const char *kernel_name, - cl_int *errcode_ret) +cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, kernel_name, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -805,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -815,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel) } } -cl_mem clCreateBuffer( - cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -839,21 +823,17 @@ cl_mem clCreateBuffer( } cl_program clCreateProgramWithSource( - cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) + cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { 
arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, count, strings, lengths, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -865,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -878,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel) cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, + cl_device_id *devices, cl_uint *num_devices) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, device_type, num_entries, devices, num_devices); } @@ -901,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -911,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device, } } -cl_int clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetMemObjectInfo( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj, param_name, param_value_size, param_value, param_value_size_ret); } @@ -933,7 +910,7 @@ cl_int 
clRetainEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -951,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, param_name, param_value_size, param_value, param_value_size_ret); } @@ -965,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_entries, platforms, num_platforms); } @@ -975,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint } } -cl_int -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -995,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, } } -cl_int -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + 
size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1014,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue, } } -cl_int -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1033,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel, } } -cl_int -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1052,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event, } } -cl_mem -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) { 
arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateImage_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -1076,14 +1048,12 @@ clCreateImage(cl_context context, } } -cl_int clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value) +cl_int +clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value); } @@ -1093,22 +1063,21 @@ cl_int clSetKernelExecInfo(cl_kernel kernel, } } -cl_command_buffer_khr clCreateCommandBufferKHR( - cl_uint num_queues, - const cl_command_queue* queues, - const cl_command_buffer_properties_khr* properties, - cl_int* errcode_ret) +cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, + const cl_command_queue *queues, + const cl_command_buffer_properties_khr *properties, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, properties, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_INVALID_OPERATION; } @@ -1122,7 +1091,7 @@ cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1137,7 +1106,7 @@ cl_int 
clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1152,7 +1121,7 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1162,18 +1131,17 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) } } -cl_int clEnqueueCommandBufferKHR( - cl_uint num_queues, - cl_command_queue* queues, - cl_command_buffer_khr command_buffer, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) +cl_int clEnqueueCommandBufferKHR(cl_uint num_queues, + cl_command_queue *queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event); } @@ -1183,27 +1151,26 @@ cl_int clEnqueueCommandBufferKHR( } } - -cl_int clCommandNDRangeKernelKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr* properties, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - const size_t* local_work_size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) +cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer, + cl_command_queue 
command_queue, + const cl_ndrange_kernel_command_properties_khr *properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr *sync_point_wait_list, + cl_sync_point_khr *sync_point, + cl_mutable_command_khr *mutable_handle) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); + return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); } else { @@ -1211,14 +1178,13 @@ cl_int clCommandNDRangeKernelKHR( } } -cl_int clUpdateMutableCommandsKHR( - cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr* mutable_config) +cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr *mutable_config) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer, mutable_config); } @@ -1228,23 +1194,22 @@ cl_int clUpdateMutableCommandsKHR( } } -cl_mem -clImportMemoryARM(cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) +cl_mem clImportMemoryARM(cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = 
arm_compute::CLSymbols::get().clImportMemoryARM_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, properties, memory, size, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h index 3f93c8d6fc..02faae2369 100644 --- a/src/core/CL/cl_kernels/activation_float_helpers.h +++ b/src/core/CL/cl_kernels/activation_float_helpers.h @@ -31,7 +31,8 @@ #endif // GPU_ARCH == GPU_ARCH_BIFROST // Hard-Swish -#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) +#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) // Logistic Activation #define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) @@ -49,13 +50,16 @@ #define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Leaky RELU Activation -#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) +#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) // Soft RELU Activation #define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) // ELU Activation -#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) +#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \ + (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) // 
Absolute Activation #define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) @@ -70,7 +74,8 @@ #define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) // GELU Activation -#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) +#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) // Identity Activation #define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h index c420578546..c758ff1278 100644 --- a/src/core/CL/cl_kernels/activation_quant_helpers.h +++ b/src/core/CL/cl_kernels/activation_quant_helpers.h @@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x) } #define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) #if defined(S1_VAL) && defined(S2_VAL) #if defined(O1_VAL) && defined(O2_VAL) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) @@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) #endif /* defined(O1_VAL) && defined(O2_VAL) */ #else /* defined(S1_VAL) && defined(S2_VAL) */ -#define PERFORM_ACTIVATION_QUANT(act, data) \ - ({ \ - data = 
ACTIVATION_OP(act, data); \ - }) +#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); }) #endif /* defined(S1_VAL) && defined(S2_VAL) */ diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 0e938cb668..4bef02314f 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -34,14 +34,14 @@ * */ #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) -#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) +#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) // offset == 0 -#define scalar_access_0_1(x) ((x).s0) -#define scalar_access_0_2(x) ((x).s01) -#define scalar_access_0_3(x) ((x).s012) -#define scalar_access_0_4(x) ((x).s0123) -#define scalar_access_0_8(x) ((x).s01234567) +#define scalar_access_0_1(x) ((x).s0) +#define scalar_access_0_2(x) ((x).s01) +#define scalar_access_0_3(x) ((x).s012) +#define scalar_access_0_4(x) ((x).s0123) +#define scalar_access_0_8(x) ((x).s01234567) #define scalar_access_0_16(x) ((x).s0123456789ABCDEF) // offset == 1 @@ -100,8 +100,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ - ({}) +#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({}) #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); @@ -186,8 +185,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) -#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, 
PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_TENSOR /** Load 2D tensor (consecutive rows and columns) with Z offset. @@ -202,8 +203,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ - ({}) +#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({}) #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); @@ -279,8 +279,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @}*/ // end of group LOAD_TENSOR_M0XN0 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
@@ -395,8 +397,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_BLOCK /** Partially load the 0 to (n-1)th rows of the given variables @@ -517,8 +521,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** Load a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. @@ -541,22 +547,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0. 
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. */ -#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in x but not y. * @@ -578,14 +585,15 @@ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \ + PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in y but not x. * @@ -607,14 +615,15 @@ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_COND_Y) \ + if (!(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** @} */ // end of group LOAD_BLOCK_PARTIAL /** Boundary-aware GeMM block load @@ -676,28 +685,33 @@ */ #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif 
PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ - LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ + LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) -#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 +#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 /** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
@@ -795,8 +809,10 @@ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) * @{ */ -#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) -#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) /** @} */ // end of group LOAD_TEXTURE2D /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded. 
@@ -815,7 +831,7 @@ #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##0; \ - if(Y_MASK##0 != 0) \ + if (Y_MASK##0 != 0) \ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ else \ BASENAME##0 = 0; @@ -824,7 +840,7 @@ LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##1; \ - if(Y_MASK##1 != 0) \ + if (Y_MASK##1 != 0) \ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ else \ BASENAME##1 = 0; @@ -833,7 +849,7 @@ LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##2; \ - if(Y_MASK##2 != 0) \ + if (Y_MASK##2 != 0) \ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ else \ BASENAME##2 = 0; @@ -842,7 +858,7 @@ LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##3; \ - if(Y_MASK##3 != 0) \ + if (Y_MASK##3 != 0) \ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ else \ BASENAME##3 = 0; @@ -851,7 +867,7 @@ LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##4; \ - if(Y_MASK##4 != 0) \ + if (Y_MASK##4 != 0) \ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ else \ BASENAME##4 = 0; @@ -860,7 +876,7 @@ LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##5; \ - if(Y_MASK##5 != 0) \ + if (Y_MASK##5 != 0) \ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ else \ BASENAME##5 = 0; @@ -869,7 +885,7 @@ LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##6; \ - if(Y_MASK##6 != 0) 
\ + if (Y_MASK##6 != 0) \ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ else \ BASENAME##6 = 0; @@ -878,7 +894,7 @@ LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##7; \ - if(Y_MASK##7 != 0) \ + if (Y_MASK##7 != 0) \ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ else \ BASENAME##7 = 0; @@ -887,7 +903,7 @@ LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##8; \ - if(Y_MASK##8 != 0) \ + if (Y_MASK##8 != 0) \ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ else \ BASENAME##8 = 0; @@ -896,7 +912,7 @@ LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##9; \ - if(Y_MASK##9 != 0) \ + if (Y_MASK##9 != 0) \ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ else \ BASENAME##9 = 0; @@ -905,7 +921,7 @@ LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##A; \ - if(Y_MASK##A != 0) \ + if (Y_MASK##A != 0) \ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ else \ BASENAME##A = 0; @@ -914,7 +930,7 @@ LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##B; \ - if(Y_MASK##B != 0) \ + if (Y_MASK##B != 0) \ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ else \ BASENAME##B = 0; @@ -923,7 +939,7 @@ LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##C; \ - if(Y_MASK##C != 0) \ + if (Y_MASK##C != 0) \ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ else \ BASENAME##C = 0; @@ -932,7 +948,7 @@ 
LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##D; \ - if(Y_MASK##D != 0) \ + if (Y_MASK##D != 0) \ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ else \ BASENAME##D = 0; @@ -941,7 +957,7 @@ LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##E; \ - if(Y_MASK##E != 0) \ + if (Y_MASK##E != 0) \ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ else \ BASENAME##E = 0; @@ -950,7 +966,7 @@ LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##F; \ - if(Y_MASK##F != 0) \ + if (Y_MASK##F != 0) \ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ else \ BASENAME##F = 0; @@ -976,8 +992,10 @@ * @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0 * @{ */ -#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) -#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) /** @} */ // end of group LOAD_BLOCK_INDIRECT /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
@@ -1088,8 +1106,10 @@ * @param[in] STRIDE_Y The stride in y-axis direction * @{ */ -#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) -#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) /** @} */ // end of group LOAD_SCALAR_AS_VECTOR /** Basic macros to calculate Z offset values from Z0 to Zn-1 @@ -1187,8 +1207,10 @@ * @param[in] STRIDE_Y The stride value in y-axis direction * @{ */ -#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) -#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) /** @} */ // end of group CALCULATE_Z_OFFSET /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) @@ -1199,8 +1221,7 @@ * @param[in] SCALE The scale factor * @{ */ -#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ - 
BASENAME##0 *= (DATA_TYPE)SCALE; +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ @@ -1275,7 +1296,7 @@ * @{ */ #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) -#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) /** @} */ // end of group SCALE_BLOCK /** Create a new vector containing the values at the given index for a set of given vectors @@ -1287,8 +1308,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); @@ -1297,13 +1317,20 @@ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 4) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); -#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 8) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); -#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, 
(X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \ + (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \ + (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); /** @} */ // end of group COLUMN_VECTORn /** Create a new vector containing the values at the given index. 
Utility macros for transposing a colum-vector @@ -1315,8 +1342,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); @@ -1329,9 +1355,10 @@ #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 8) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); -#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); /** @} */ // end of group COLUMN_VECTOR_SCALARn /** Create transposed vectors of the given vectors @@ -1343,8 +1370,7 @@ * @param[in] TYPE The data type of the transposed vectors * @{ */ -#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ - COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); +#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); @@ -1417,8 +1443,7 @@ * @param[in] BIAS The basename of the added variables * @{ */ -#define ADD_ROW_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS##0; +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; #define ADD_ROW_2(BASENAME, 
BIAS) \ ADD_ROW_1(BASENAME, BIAS) \ @@ -1493,7 +1518,7 @@ * @{ */ #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) -#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK /** Broadcast (add single value) to the each element of the destination variables @@ -1503,8 +1528,7 @@ * @param[in] BIAS The variable containing the value to add * @{ */ -#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS; +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ @@ -1578,7 +1602,7 @@ * @{ */ #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) -#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK_BROADCAST /** Apply activation to the given variables @@ -1668,8 +1692,10 @@ * @param[in] B_VAL Additional value required by the activation * @{ */ -#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) -#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) /** @} */ // end of group ACTIVATION_BLOCK /** Apply convert_<data_type> to the given variables 
@@ -1773,6 +1799,8 @@ * @param[in] BASENAME_DST The basename of the destination variables * @{ */ -#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) -#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) /** @} */ // end of group CONVERT_BLOCK diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index b2ceaf92f3..87a1875f93 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -81,11 +81,11 @@ * @return The reversed vector * @{ */ -#define REV1(x) ((x)) -#define REV2(x) ((x).s10) -#define REV3(x) ((x).s210) -#define REV4(x) ((x).s3210) -#define REV8(x) ((x).s76543210) +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) #define REV16(x) ((x).sFEDCBA9876543210) /** @} */ // end of group REVn @@ -99,7 +99,7 @@ * @{ */ #define REVERSE_STR(x, s) REV##s((x)) -#define REVERSE(x, s) REVERSE_STR(x, s) +#define REVERSE(x, s) REVERSE_STR(x, s) /** @} */ // end of group REVERSE /** Circular-right-shift (rotate-right) the vector of size s by the amount of n. 
@@ -138,16 +138,16 @@ #define ROT8_7(x) ((x).s12345670) #define ROT8_8(x) ((x)) -#define ROT16_0(x) ((x)) -#define ROT16_1(x) ((x).sF0123456789ABCDE) -#define ROT16_2(x) ((x).sEF0123456789ABCD) -#define ROT16_3(x) ((x).sDEF0123456789ABC) -#define ROT16_4(x) ((x).sCDEF0123456789AB) -#define ROT16_5(x) ((x).sBCDEF0123456789A) -#define ROT16_6(x) ((x).sABCDEF0123456789) -#define ROT16_7(x) ((x).s9ABCDEF012345678) -#define ROT16_8(x) ((x).s89ABCDEF01234567) -#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) #define ROT16_10(x) ((x).s6789ABCDEF012345) #define ROT16_11(x) ((x).s56789ABCDEF01234) #define ROT16_12(x) ((x).s456789ABCDEF0123) @@ -168,7 +168,7 @@ * @{ */ #define ROTATE_STR(x, s, n) ROT##s##_##n(x) -#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) /** @} */ // end of group ROTATE /** Creates a vector of size n filled with offset values corresponding to the location of each element. 
@@ -179,11 +179,11 @@ * @return The vector filled with offset values * @{ */ -#define V_OFFS1(dt) (dt##1)(0) -#define V_OFFS2(dt) (dt##2)(0, 1) -#define V_OFFS3(dt) (dt##3)(0, 1, 2) -#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) -#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS1(dt) (dt##1)(0) +#define V_OFFS2(dt) (dt##2)(0, 1) +#define V_OFFS3(dt) (dt##3)(0, 1, 2) +#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) /** @} */ // end of group V_OFFSn @@ -197,11 +197,11 @@ * @{ */ #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) -#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) /** @} */ // end of group VEC_OFFS #define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) +#define VLOAD(size) VLOAD_STR(size) /** Extended partial vload that correctly handles scalar values as well. * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops @@ -219,23 +219,23 @@ * @{ */ #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size -#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) +#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) #define NO_LOAD(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vload_partial_1_0 NO_LOAD -#define vload_partial_1_1 vload1 -#define vload_partial_1_2 NO_LOAD -#define vload_partial_1_3 NO_LOAD -#define vload_partial_1_4 NO_LOAD -#define vload_partial_1_5 NO_LOAD -#define vload_partial_1_6 NO_LOAD -#define vload_partial_1_7 NO_LOAD -#define vload_partial_1_8 NO_LOAD -#define vload_partial_1_9 NO_LOAD +#define vload_partial_1_0 NO_LOAD +#define vload_partial_1_1 vload1 +#define vload_partial_1_2 NO_LOAD +#define vload_partial_1_3 NO_LOAD +#define vload_partial_1_4 NO_LOAD +#define vload_partial_1_5 NO_LOAD +#define vload_partial_1_6 NO_LOAD +#define 
vload_partial_1_7 NO_LOAD +#define vload_partial_1_8 NO_LOAD +#define vload_partial_1_9 NO_LOAD #define vload_partial_1_10 NO_LOAD #define vload_partial_1_11 NO_LOAD #define vload_partial_1_12 NO_LOAD @@ -244,16 +244,16 @@ #define vload_partial_1_15 NO_LOAD #define vload_partial_1_16 NO_LOAD // Size == 2 -#define vload_partial_2_0 NO_LOAD -#define vload_partial_2_1 vload_partial_1 -#define vload_partial_2_2 vload_partial_2 -#define vload_partial_2_3 NO_LOAD -#define vload_partial_2_4 NO_LOAD -#define vload_partial_2_5 NO_LOAD -#define vload_partial_2_6 NO_LOAD -#define vload_partial_2_7 NO_LOAD -#define vload_partial_2_8 NO_LOAD -#define vload_partial_2_9 NO_LOAD +#define vload_partial_2_0 NO_LOAD +#define vload_partial_2_1 vload_partial_1 +#define vload_partial_2_2 vload_partial_2 +#define vload_partial_2_3 NO_LOAD +#define vload_partial_2_4 NO_LOAD +#define vload_partial_2_5 NO_LOAD +#define vload_partial_2_6 NO_LOAD +#define vload_partial_2_7 NO_LOAD +#define vload_partial_2_8 NO_LOAD +#define vload_partial_2_9 NO_LOAD #define vload_partial_2_10 NO_LOAD #define vload_partial_2_11 NO_LOAD #define vload_partial_2_12 NO_LOAD @@ -262,16 +262,16 @@ #define vload_partial_2_15 NO_LOAD #define vload_partial_2_16 NO_LOAD // Size == 3 -#define vload_partial_3_0 NO_LOAD -#define vload_partial_3_1 vload_partial_1 -#define vload_partial_3_2 vload_partial_2 -#define vload_partial_3_3 vload_partial_3 -#define vload_partial_3_4 NO_LOAD -#define vload_partial_3_5 NO_LOAD -#define vload_partial_3_6 NO_LOAD -#define vload_partial_3_7 NO_LOAD -#define vload_partial_3_8 NO_LOAD -#define vload_partial_3_9 NO_LOAD +#define vload_partial_3_0 NO_LOAD +#define vload_partial_3_1 vload_partial_1 +#define vload_partial_3_2 vload_partial_2 +#define vload_partial_3_3 vload_partial_3 +#define vload_partial_3_4 NO_LOAD +#define vload_partial_3_5 NO_LOAD +#define vload_partial_3_6 NO_LOAD +#define vload_partial_3_7 NO_LOAD +#define vload_partial_3_8 NO_LOAD +#define vload_partial_3_9 NO_LOAD 
#define vload_partial_3_10 NO_LOAD #define vload_partial_3_11 NO_LOAD #define vload_partial_3_12 NO_LOAD @@ -280,16 +280,16 @@ #define vload_partial_3_15 NO_LOAD #define vload_partial_3_16 NO_LOAD // Size == 4 -#define vload_partial_4_0 NO_LOAD -#define vload_partial_4_1 vload_partial_1 -#define vload_partial_4_2 vload_partial_2 -#define vload_partial_4_3 vload_partial_3 -#define vload_partial_4_4 vload_partial_4 -#define vload_partial_4_5 NO_LOAD -#define vload_partial_4_6 NO_LOAD -#define vload_partial_4_7 NO_LOAD -#define vload_partial_4_8 NO_LOAD -#define vload_partial_4_9 NO_LOAD +#define vload_partial_4_0 NO_LOAD +#define vload_partial_4_1 vload_partial_1 +#define vload_partial_4_2 vload_partial_2 +#define vload_partial_4_3 vload_partial_3 +#define vload_partial_4_4 vload_partial_4 +#define vload_partial_4_5 NO_LOAD +#define vload_partial_4_6 NO_LOAD +#define vload_partial_4_7 NO_LOAD +#define vload_partial_4_8 NO_LOAD +#define vload_partial_4_9 NO_LOAD #define vload_partial_4_10 NO_LOAD #define vload_partial_4_11 NO_LOAD #define vload_partial_4_12 NO_LOAD @@ -298,16 +298,16 @@ #define vload_partial_4_15 NO_LOAD #define vload_partial_4_16 NO_LOAD // Size == 8 -#define vload_partial_8_0 NO_LOAD -#define vload_partial_8_1 vload_partial_1 -#define vload_partial_8_2 vload_partial_2 -#define vload_partial_8_3 vload_partial_3 -#define vload_partial_8_4 vload_partial_4 -#define vload_partial_8_5 vload_partial_5 -#define vload_partial_8_6 vload_partial_6 -#define vload_partial_8_7 vload_partial_7 -#define vload_partial_8_8 vload_partial_8 -#define vload_partial_8_9 NO_LOAD +#define vload_partial_8_0 NO_LOAD +#define vload_partial_8_1 vload_partial_1 +#define vload_partial_8_2 vload_partial_2 +#define vload_partial_8_3 vload_partial_3 +#define vload_partial_8_4 vload_partial_4 +#define vload_partial_8_5 vload_partial_5 +#define vload_partial_8_6 vload_partial_6 +#define vload_partial_8_7 vload_partial_7 +#define vload_partial_8_8 vload_partial_8 +#define 
vload_partial_8_9 NO_LOAD #define vload_partial_8_10 NO_LOAD #define vload_partial_8_11 NO_LOAD #define vload_partial_8_12 NO_LOAD @@ -316,16 +316,16 @@ #define vload_partial_8_15 NO_LOAD #define vload_partial_8_16 NO_LOAD // Size == 16 -#define vload_partial_16_0 NO_LOAD -#define vload_partial_16_1 vload_partial_1 -#define vload_partial_16_2 vload_partial_2 -#define vload_partial_16_3 vload_partial_3 -#define vload_partial_16_4 vload_partial_4 -#define vload_partial_16_5 vload_partial_5 -#define vload_partial_16_6 vload_partial_6 -#define vload_partial_16_7 vload_partial_7 -#define vload_partial_16_8 vload_partial_8 -#define vload_partial_16_9 vload_partial_9 +#define vload_partial_16_0 NO_LOAD +#define vload_partial_16_1 vload_partial_1 +#define vload_partial_16_2 vload_partial_2 +#define vload_partial_16_3 vload_partial_3 +#define vload_partial_16_4 vload_partial_4 +#define vload_partial_16_5 vload_partial_5 +#define vload_partial_16_6 vload_partial_6 +#define vload_partial_16_7 vload_partial_7 +#define vload_partial_16_8 vload_partial_8 +#define vload_partial_16_9 vload_partial_9 #define vload_partial_16_10 vload_partial_10 #define vload_partial_16_11 vload_partial_11 #define vload_partial_16_12 vload_partial_12 @@ -351,17 +351,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vload_partial_1(DATA, OFFSET, PTR) \ - DATA.s0 = vload1(OFFSET, PTR); +#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR); -#define vload_partial_2(DATA, OFFSET, PTR) \ - DATA.s01 = vload2(OFFSET, PTR); +#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR); -#define vload_partial_3(DATA, OFFSET, PTR) \ - DATA.s012 = vload3(OFFSET, PTR); +#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR); -#define vload_partial_4(DATA, OFFSET, PTR) \ - DATA.s0123 = vload4(OFFSET, PTR); +#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR); #define vload_partial_5(DATA, OFFSET, PTR) \ 
vload_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -375,8 +371,7 @@ vload_partial_4(DATA.s0123, OFFSET, PTR); \ vload_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vload_partial_8(DATA, OFFSET, PTR) \ - DATA.s01234567 = vload8(OFFSET, PTR); +#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR); #define vload_partial_9(DATA, OFFSET, PTR) \ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -406,13 +401,12 @@ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); -#define vload_partial_16(DATA, OFFSET, PTR) \ - DATA = vload16(OFFSET, PTR); +#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR); /** @} */ // end of groupd vload_partial_n /** @} */ // end of groupd VLOAD_PARTIAL -#define PIXEL_UNIT4 1 -#define PIXEL_UNIT8 2 +#define PIXEL_UNIT4 1 +#define PIXEL_UNIT8 2 #define PIXEL_UNIT16 4 /** Utility macro to convert a vector size in pixel unit. @@ -425,27 +419,45 @@ * @{ */ #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size -#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) /** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); -#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_floatx2(img, x_coord, y_coord) \ + (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); +#define 
read_image2d_floatx4(img, x_coord, y_coord) \ + (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \ + read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); -#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_halfx2(img, x_coord, y_coord) \ + (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_halfx4(img, x_coord, y_coord) \ + (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \ + read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) \ + 
(write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) /** Utility macro to read a 2D OpenCL image object. 
@@ -462,7 +474,7 @@ * @{ */ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) -#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) /** @} */ /** Utility macro to write a 2D OpenCL image object. @@ -478,26 +490,28 @@ * * @{ */ -#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) -#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \ + write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \ + WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) /** @} */ #define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) +#define VSTORE(size) VSTORE_STR(size) -#define float1 float -#define half1 half -#define char1 char -#define uchar1 uchar -#define short1 short +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short #define ushort1 ushort -#define int1 int -#define uint1 uint -#define long1 long -#define ulong1 ulong +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong #define double1 double -#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vload1(OFFSET, PTR) *(OFFSET + PTR) #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA /** Extended partial vstore that correctly handles scalar values as well. 
@@ -516,23 +530,23 @@ * @{ */ #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size -#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) +#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) #define NO_STORE(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vstore_partial_1_0 NO_STORE -#define vstore_partial_1_1 vstore1 -#define vstore_partial_1_2 NO_STORE -#define vstore_partial_1_3 NO_STORE -#define vstore_partial_1_4 NO_STORE -#define vstore_partial_1_5 NO_STORE -#define vstore_partial_1_6 NO_STORE -#define vstore_partial_1_7 NO_STORE -#define vstore_partial_1_8 NO_STORE -#define vstore_partial_1_9 NO_STORE +#define vstore_partial_1_0 NO_STORE +#define vstore_partial_1_1 vstore1 +#define vstore_partial_1_2 NO_STORE +#define vstore_partial_1_3 NO_STORE +#define vstore_partial_1_4 NO_STORE +#define vstore_partial_1_5 NO_STORE +#define vstore_partial_1_6 NO_STORE +#define vstore_partial_1_7 NO_STORE +#define vstore_partial_1_8 NO_STORE +#define vstore_partial_1_9 NO_STORE #define vstore_partial_1_10 NO_STORE #define vstore_partial_1_11 NO_STORE #define vstore_partial_1_12 NO_STORE @@ -541,16 +555,16 @@ #define vstore_partial_1_15 NO_STORE #define vstore_partial_1_16 NO_STORE // Size == 2 -#define vstore_partial_2_0 NO_STORE -#define vstore_partial_2_1 vstore_partial_1 -#define vstore_partial_2_2 vstore_partial_2 -#define vstore_partial_2_3 NO_STORE -#define vstore_partial_2_4 NO_STORE -#define vstore_partial_2_5 NO_STORE -#define vstore_partial_2_6 NO_STORE -#define vstore_partial_2_7 NO_STORE -#define vstore_partial_2_8 NO_STORE -#define vstore_partial_2_9 NO_STORE +#define vstore_partial_2_0 NO_STORE +#define vstore_partial_2_1 vstore_partial_1 +#define vstore_partial_2_2 vstore_partial_2 +#define vstore_partial_2_3 NO_STORE +#define vstore_partial_2_4 NO_STORE +#define vstore_partial_2_5 NO_STORE +#define vstore_partial_2_6 NO_STORE +#define vstore_partial_2_7 NO_STORE +#define 
vstore_partial_2_8 NO_STORE +#define vstore_partial_2_9 NO_STORE #define vstore_partial_2_10 NO_STORE #define vstore_partial_2_11 NO_STORE #define vstore_partial_2_12 NO_STORE @@ -559,16 +573,16 @@ #define vstore_partial_2_15 NO_STORE #define vstore_partial_2_16 NO_STORE // Size == 3 -#define vstore_partial_3_0 NO_STORE -#define vstore_partial_3_1 vstore_partial_1 -#define vstore_partial_3_2 vstore_partial_2 -#define vstore_partial_3_3 vstore_partial_3 -#define vstore_partial_3_4 NO_STORE -#define vstore_partial_3_5 NO_STORE -#define vstore_partial_3_6 NO_STORE -#define vstore_partial_3_7 NO_STORE -#define vstore_partial_3_8 NO_STORE -#define vstore_partial_3_9 NO_STORE +#define vstore_partial_3_0 NO_STORE +#define vstore_partial_3_1 vstore_partial_1 +#define vstore_partial_3_2 vstore_partial_2 +#define vstore_partial_3_3 vstore_partial_3 +#define vstore_partial_3_4 NO_STORE +#define vstore_partial_3_5 NO_STORE +#define vstore_partial_3_6 NO_STORE +#define vstore_partial_3_7 NO_STORE +#define vstore_partial_3_8 NO_STORE +#define vstore_partial_3_9 NO_STORE #define vstore_partial_3_10 NO_STORE #define vstore_partial_3_11 NO_STORE #define vstore_partial_3_12 NO_STORE @@ -577,16 +591,16 @@ #define vstore_partial_3_15 NO_STORE #define vstore_partial_3_16 NO_STORE // Size == 4 -#define vstore_partial_4_0 NO_STORE -#define vstore_partial_4_1 vstore_partial_1 -#define vstore_partial_4_2 vstore_partial_2 -#define vstore_partial_4_3 vstore_partial_3 -#define vstore_partial_4_4 vstore_partial_4 -#define vstore_partial_4_5 NO_STORE -#define vstore_partial_4_6 NO_STORE -#define vstore_partial_4_7 NO_STORE -#define vstore_partial_4_8 NO_STORE -#define vstore_partial_4_9 NO_STORE +#define vstore_partial_4_0 NO_STORE +#define vstore_partial_4_1 vstore_partial_1 +#define vstore_partial_4_2 vstore_partial_2 +#define vstore_partial_4_3 vstore_partial_3 +#define vstore_partial_4_4 vstore_partial_4 +#define vstore_partial_4_5 NO_STORE +#define vstore_partial_4_6 NO_STORE +#define 
vstore_partial_4_7 NO_STORE +#define vstore_partial_4_8 NO_STORE +#define vstore_partial_4_9 NO_STORE #define vstore_partial_4_10 NO_STORE #define vstore_partial_4_11 NO_STORE #define vstore_partial_4_12 NO_STORE @@ -595,16 +609,16 @@ #define vstore_partial_4_15 NO_STORE #define vstore_partial_4_16 NO_STORE // Size == 8 -#define vstore_partial_8_0 NO_STORE -#define vstore_partial_8_1 vstore_partial_1 -#define vstore_partial_8_2 vstore_partial_2 -#define vstore_partial_8_3 vstore_partial_3 -#define vstore_partial_8_4 vstore_partial_4 -#define vstore_partial_8_5 vstore_partial_5 -#define vstore_partial_8_6 vstore_partial_6 -#define vstore_partial_8_7 vstore_partial_7 -#define vstore_partial_8_8 vstore_partial_8 -#define vstore_partial_8_9 NO_STORE +#define vstore_partial_8_0 NO_STORE +#define vstore_partial_8_1 vstore_partial_1 +#define vstore_partial_8_2 vstore_partial_2 +#define vstore_partial_8_3 vstore_partial_3 +#define vstore_partial_8_4 vstore_partial_4 +#define vstore_partial_8_5 vstore_partial_5 +#define vstore_partial_8_6 vstore_partial_6 +#define vstore_partial_8_7 vstore_partial_7 +#define vstore_partial_8_8 vstore_partial_8 +#define vstore_partial_8_9 NO_STORE #define vstore_partial_8_10 NO_STORE #define vstore_partial_8_11 NO_STORE #define vstore_partial_8_12 NO_STORE @@ -613,16 +627,16 @@ #define vstore_partial_8_15 NO_STORE #define vstore_partial_8_16 NO_STORE // Size == 16 -#define vstore_partial_16_0 NO_STORE -#define vstore_partial_16_1 vstore_partial_1 -#define vstore_partial_16_2 vstore_partial_2 -#define vstore_partial_16_3 vstore_partial_3 -#define vstore_partial_16_4 vstore_partial_4 -#define vstore_partial_16_5 vstore_partial_5 -#define vstore_partial_16_6 vstore_partial_6 -#define vstore_partial_16_7 vstore_partial_7 -#define vstore_partial_16_8 vstore_partial_8 -#define vstore_partial_16_9 vstore_partial_9 +#define vstore_partial_16_0 NO_STORE +#define vstore_partial_16_1 vstore_partial_1 +#define vstore_partial_16_2 vstore_partial_2 
+#define vstore_partial_16_3 vstore_partial_3 +#define vstore_partial_16_4 vstore_partial_4 +#define vstore_partial_16_5 vstore_partial_5 +#define vstore_partial_16_6 vstore_partial_6 +#define vstore_partial_16_7 vstore_partial_7 +#define vstore_partial_16_8 vstore_partial_8 +#define vstore_partial_16_9 vstore_partial_9 #define vstore_partial_16_10 vstore_partial_10 #define vstore_partial_16_11 vstore_partial_11 #define vstore_partial_16_12 vstore_partial_12 @@ -648,17 +662,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vstore_partial_1(DATA, OFFSET, PTR) \ - vstore1(DATA.s0, OFFSET, PTR); +#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR); -#define vstore_partial_2(DATA, OFFSET, PTR) \ - vstore2(DATA.s01, OFFSET, PTR); +#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR); -#define vstore_partial_3(DATA, OFFSET, PTR) \ - vstore3(DATA.s012, OFFSET, PTR); +#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR); -#define vstore_partial_4(DATA, OFFSET, PTR) \ - vstore4(DATA.s0123, OFFSET, PTR); +#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR); #define vstore_partial_5(DATA, OFFSET, PTR) \ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -672,8 +682,7 @@ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ vstore_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vstore_partial_8(DATA, OFFSET, PTR) \ - vstore8(DATA.s01234567, OFFSET, PTR); +#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR); #define vstore_partial_9(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -703,186 +712,156 @@ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); -#define vstore_partial_16(DATA, OFFSET, PTR) \ - vstore16(DATA, OFFSET, PTR); +#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR); /** @} */ // end of groupd vstore_partial_n /** @} */ // end of groupd VSTORE_PARTIAL // 
Convert built-in functions with _sat modifier are not supported in floating point so we create defines // without _sat to overcome this issue -#define convert_float_sat convert_float -#define convert_float1_sat convert_float -#define convert_float2_sat convert_float2 -#define convert_float3_sat convert_float3 -#define convert_float4_sat convert_float4 -#define convert_float8_sat convert_float8 +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 #define convert_float16_sat convert_float16 -#define convert_half_sat convert_float -#define convert_half1_sat convert_half -#define convert_half2_sat convert_half2 -#define convert_half3_sat convert_half3 -#define convert_half4_sat convert_half4 -#define convert_half8_sat convert_half8 -#define convert_half16_sat convert_half16 - -#define convert_float1 convert_float -#define convert_half1 convert_half -#define convert_char1 convert_char -#define convert_uchar1 convert_uchar -#define convert_short1 convert_short +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short #define convert_ushort1 convert_ushort -#define convert_int1 convert_int -#define convert_uint1 convert_uint -#define convert_long1 convert_long -#define convert_ulong1 convert_ulong +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong #define convert_double1 
convert_double -#define convert_char1_sat convert_char_sat -#define convert_uchar1_sat convert_uchar_sat -#define convert_uchar2_sat convert_uchar2_sat -#define convert_uchar3_sat convert_uchar3_sat -#define convert_uchar4_sat convert_uchar4_sat -#define convert_uchar8_sat convert_uchar8_sat +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_uchar2_sat convert_uchar2_sat +#define convert_uchar3_sat convert_uchar3_sat +#define convert_uchar4_sat convert_uchar4_sat +#define convert_uchar8_sat convert_uchar8_sat #define convert_uchar16_sat convert_uchar16_sat -#define convert_short1_sat convert_short_sat +#define convert_short1_sat convert_short_sat #define convert_ushort1_sat convert_ushort_sat -#define convert_int1_sat convert_int_sat -#define convert_uint1_sat convert_uint_sat -#define convert_long1_sat convert_long_sat -#define convert_ulong1_sat convert_ulong_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat #define convert_double1_sat convert_double_sat #define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) #define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) +#define CONVERT(x, type) CONVERT_STR(x, type) #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) -#define select_vec_dt_uchar(size) uchar##size -#define select_vec_dt_char(size) char##size +#define 
select_vec_dt_uchar(size) uchar##size +#define select_vec_dt_char(size) char##size #define select_vec_dt_ushort(size) ushort##size -#define select_vec_dt_short(size) short##size -#define select_vec_dt_half(size) short##size -#define select_vec_dt_uint(size) uint##size -#define select_vec_dt_int(size) int##size -#define select_vec_dt_float(size) int##size -#define select_vec_dt_ulong(size) ulong##size -#define select_vec_dt_long(size) long##size +#define select_vec_dt_short(size) short##size +#define select_vec_dt_half(size) short##size +#define select_vec_dt_uint(size) uint##size +#define select_vec_dt_int(size) int##size +#define select_vec_dt_float(size) int##size +#define select_vec_dt_ulong(size) ulong##size +#define select_vec_dt_long(size) long##size #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) -#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) -#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) +#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) +#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) -#define signed_int_vec_dt_uchar(size) char##size -#define signed_int_vec_dt_char(size) char##size +#define signed_int_vec_dt_uchar(size) char##size +#define signed_int_vec_dt_char(size) char##size #define signed_int_vec_dt_ushort(size) short##size -#define signed_int_vec_dt_short(size) short##size -#define signed_int_vec_dt_half(size) short##size -#define signed_int_vec_dt_uint(size) int##size -#define signed_int_vec_dt_int(size) int##size -#define signed_int_vec_dt_float(size) int##size -#define signed_int_vec_dt_ulong(size) long##size -#define signed_int_vec_dt_long(size) long##size +#define signed_int_vec_dt_short(size) short##size +#define signed_int_vec_dt_half(size) short##size +#define signed_int_vec_dt_uint(size) int##size +#define signed_int_vec_dt_int(size) int##size +#define signed_int_vec_dt_float(size) int##size +#define 
signed_int_vec_dt_ulong(size) long##size +#define signed_int_vec_dt_long(size) long##size #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) -#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) -#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) - -#define sum_reduce_1(x) (x) -#define sum_reduce_2(x) ((x).s0) + ((x).s1) -#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) -#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) -#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) +#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) +#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) + +#define sum_reduce_1(x) (x) +#define sum_reduce_2(x) ((x).s0) + ((x).s1) +#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) +#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) +#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) -#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) +#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) -#define prod_reduce_1(x) (x) -#define prod_reduce_2(x) ((x).s0) * ((x).s1) -#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) -#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) -#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) +#define prod_reduce_1(x) (x) +#define prod_reduce_2(x) ((x).s0) * ((x).s1) +#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) +#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) +#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) #define PROD_REDUCE_STR(x, size) 
prod_reduce_##size(x) -#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) +#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) -#define max_reduce_1(x) (x) -#define max_reduce_2(x) max(((x).s0), ((x).s1)) -#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) -#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) -#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) +#define max_reduce_1(x) (x) +#define max_reduce_2(x) max(((x).s0), ((x).s1)) +#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) +#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) +#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) #define MAX_REDUCE_STR(x, size) max_reduce_##size(x) -#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) +#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) -#define min_reduce_1(x) (x) -#define min_reduce_2(x) min(((x).s0), ((x).s1)) -#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) -#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) -#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) +#define min_reduce_1(x) (x) +#define min_reduce_2(x) min(((x).s0), ((x).s1)) +#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) +#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) +#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) #define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF)) #define MIN_REDUCE_STR(x, size) min_reduce_##size(x) -#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint 
name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR5D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_stride_v, \ - uint name##_step_v, \ - uint name##_offset_first_element_in_bytes +#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \ + uint 
name##_offset_first_element_in_bytes + +#define TENSOR5D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \ + uint name##_step_v, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) @@ -890,38 +869,47 @@ #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y) #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + 
update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0) -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, 
name##_step_z, name##_stride_w, \ + name##_step_w, mod_size) -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ - tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ + tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) /** Structure to hold Vector information */ typedef struct Vector @@ -970,10 +958,10 @@ typedef struct Tensor4D * * @return An image object */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) +inline Vector +update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { - Vector vector = - { + Vector vector = { .ptr = ptr, .offset_first_element_in_bytes = offset_first_element_in_bytes, .stride_x = stride_x, @@ -993,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +inline Image update_image_workitem_ptr( + __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { - Image img = - { 
- .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } @@ -1019,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return img; } @@ -1045,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o * * @return A 3D tensor object */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint 
step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return tensor; } @@ -1072,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi * * @return A 3D tensor object */ -inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; return tensor; } -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, - uint step_w, - uint mod_size) +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + 
uint step_z, + uint stride_w, + uint step_w, + uint mod_size) { - Tensor4D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w - }; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; return tensor; } @@ -1171,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid const uint x = index; - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + tensor->offset_first_element_in_bytes; } #endif // _HELPER_H diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 562c5d3236..166260a3c0 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -34,7 +34,7 @@ * @return The converted vector */ #define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) /** Quantize a floating-point scalar value to 8-bit asymmetric * @@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline 
VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. 
*/ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ - { \ - const VEC_DATA_TYPE(int, size) \ - zero = (VEC_DATA_TYPE(int, size))0; \ - const VEC_DATA_TYPE(int, size) \ - one = (VEC_DATA_TYPE(int, size))1; \ - VEC_DATA_TYPE(int, size) \ - mask = (one << exponent) - one; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ - return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; 
\ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not. */ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \ + VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ } -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if(k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? 
k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ } /** Calculates \f$ exp(x) \f$ for x < 0. @@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if(k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline 
VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \ + size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ + result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ } /** Calculates the product of a integer value by a power of two, with either a positive exponent @@ -297,26 +303,27 @@ 
inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. */ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if(exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. 
@@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 
@@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ VEC_DATA_TYPE(int, size) \ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for(int i = 0; i < 3; i++) \ + for (int i = 0; i < 3; i++) \ { \ VEC_DATA_TYPE(int, size) \ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ @@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Rescaled value. */ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } -#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) -#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, 
scale) -#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) #define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) -#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) +#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, 
k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) #define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) - -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int 
left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ +#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) + +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ } -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) QUANTIZE_IMPL(uchar, 1) QUANTIZE_IMPL(char, 1) diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h index 4ba2b2ca3a..4daf0adc89 100644 --- a/src/core/CL/cl_kernels/load_store_utility.h +++ b/src/core/CL/cl_kernels/load_store_utility.h @@ -223,8 +223,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + 
STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group STORE_BLOCK /** Convert and store a block of the given size M0xN0 @@ -245,8 +247,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group CONVERT_STORE_BLOCK /** Partially store the 0 to (n-1)th rows of the given variables @@ -365,8 +369,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** Store a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 
@@ -388,22 +394,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. */ -#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ +#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } /** Store a block that can only be partial in x but not y. * @@ -425,7 +432,7 @@ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ + if (!(PARTIAL_COND_X)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -453,7 +460,7 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. */ #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ + if (!(PARTIAL_COND_Y)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -517,23 +524,28 @@ #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ + STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 @@ -560,8 +572,7 @@ #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) #else // defined(PARTIAL_STORE_M0) -#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ - ((uint)(y * M0)) +#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0)) #endif // defined(PARTIAL_STORE_M0) /** @} */ // end of group COMPUTE_M0_START_ROW diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h index bed94a7b3b..cb2f4b0319 100644 --- a/src/core/CL/cl_kernels/repeat.h +++ b/src/core/CL/cl_kernels/repeat.h @@ -75,7 +75,9 @@ P_X##_DEF(F, P_A, P_B, P_C); \ REPEAT_3_15(P_X, P_A, P_B, P_C) -#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, \ + P_C) //One level of indirection to ensure order of expansion does 
not affect preprocessing P_NUM #define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) // Repeat macros with 4 param, excluding the implicit ID param @@ -126,52 +128,59 @@ P_X##_DEF(F, P_A, P_B, P_C, P_D); \ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) -#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \ + P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM #define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) // Macro for initializing N variables. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) +#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) #define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) 
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) -#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for adding a constant to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL #define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) // Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) -#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL #define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) // Macro for adding a vector to N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) #define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC -#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) // Macro for adding a two N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) #define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID -#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) // Macro for performing Max between a constant and N variables. 
Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) #define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) // Macro for performing Min between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) #define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) 
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. #define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ @@ -182,6 +191,7 @@ VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ }) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) #endif // ARM_COMPUTE_REPEAT_H diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h index 642483ab3c..6595bd1981 100644 --- a/src/core/CL/cl_kernels/warp_helpers.h +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -31,11 +31,13 @@ * @param[in] border_size Border size of the image * */ -inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +inline const float8 +clamp_to_border_with_size(float8 coords, 
const float width, const float height, const float border_size) { const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); - return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, + clamped_y.s3); } /** Clamps the given coordinates to the borders. @@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int */ inline const float8 get_neighbour_coords(const float2 coord) { - return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, + /*br*/ coord.s0 + 1, coord.s1 + 1); } /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values @@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord) * @param[in] height Height of the image * @param[in] border_size Border size */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border( + const Image *in, const float8 coords, const float width, const float height, const float border_size) { // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
// Sets the 4x4 coordinates for each of the four input texels const float8 fc = floor(coords); - const float16 c1 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); - const float16 c2 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + const float16 c1 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); // Loads the values from the input image const float16 t = (float16)( - /* tl, tr, bl, br */ - * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), - *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), - *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), - *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), - *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), - *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), - *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), - *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); - const float8 a = 
coords - fc; - const float8 b = ((float8)(1.f)) - a; - const float4 fr = (float4)( - ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), - ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), - ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), - ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + /* tl, tr, bl, br */ + *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), + *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), + *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), + *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), + *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), + *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), + *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), + *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = + (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4)); } @@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const * @param[in] width Width of the image * 
@param[in] height Height of the image */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) + bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) { return bilinear_interpolate_with_border(in, coords, width, height, 1); } diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp index 2728958add..5b72354abe 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are 
supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, + DataType::U64); } return Status{}; @@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape{ input->info()->tensor_shape() }; + TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(DataType::S32) + .reset_padding() + .set_is_resizable(true)); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = 
get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Set build options const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0)); - const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. + const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) + ? 2U + : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size)); build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); @@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Create kernel std::string kernel_axis_name; - switch(axis) + switch (axis) { case 0: build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation 
op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { @@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) Window out_window(window); Window in_window(window); out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + in_window.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u)); // Get first input and output slices @@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } break; case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); 
add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h 
b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h index 5f36bdf113..fb3b41b0de 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -72,7 +73,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel. * @@ -84,7 +89,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp index 3fa8a8edaa..c88a852a44 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp @@ -23,58 +23,64 @@ */ #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include 
"arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); - if(beta != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); } - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); 
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); bool window_changed = false; - if(output != nullptr) + if (output != nullptr) { AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); window_changed = update_window_and_padding(win, input_access, output_access); @@ -104,30 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, window_changed = 
update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() - : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false) + : _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _beta(nullptr), + _gamma(nullptr), + _epsilon(0), + _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } -void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); - auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma }); 
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma}); _input = input; _output = output; _mean = mean; @@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info)); - unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); @@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA"); // Create kernel - _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = + create_kernel(compile_context, + "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), 
+ build_opts.options()); // Set kernel static arguments unsigned int include_output = (!_run_in_place) ? 1 : 0; - unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - if(_beta != nullptr) + unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + if (_beta != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip beta parameter } - if(_gamma != nullptr) + if (_gamma != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip gamma parameter } _kernel.setArg<cl_float>(idx++, _epsilon); - if(output != nullptr) + if (output != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } // Configure kernel window - if(input->info()->data_layout() == DataLayout::NHWC) + if (input->info()->data_layout() == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } -Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, 
act_info)); - if(input->data_layout() != DataLayout::NHWC) + if (input->data_layout() != DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()) + .first); } return Status{}; @@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _mean, vector_slice); add_1D_tensor_argument(idx, _var, vector_slice); - if(_beta != nullptr) + if (_beta != nullptr) { add_1D_tensor_argument(idx, _beta, vector_slice); } - if(_gamma != nullptr) + if (_gamma != nullptr) { add_1D_tensor_argument(idx, _gamma, vector_slice); } @@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue { idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h index acbe0f2a26..1a88d2a8c5 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Set the input and output tensors. * @@ -82,8 +89,15 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, - const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
@@ -99,10 +113,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp index 143a842d02..c640b5a8d6 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp @@ -25,13 +25,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include "arm_compute/core/TensorInfo.h" using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const 
ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -79,8 +85,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape } } // namespace -CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr) +CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso 
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, block_shape, output }); + auto padding_info = get_padding_info({input, block_shape, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info())); @@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); - _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); - + _kernel = create_kernel(compile_context, + "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, 
block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, - const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x)); build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y)); build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left)); build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top)); - _kernel = 
create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_argument(idx, batch_id); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { add_1D_tensor_argument(idx, _block_shape, vector_slice); } @@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } -} // 
namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h index a05184cd5b..b9d3e66fe2 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -65,7 +66,10 @@ public: * * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -74,7 +78,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] compile_context The compile context to be used. @@ -84,7 +92,12 @@ public: * @param[out] output Tensor output. 
Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -106,7 +119,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info); + static Status validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp index 11e6d021a5..de3fb43de8 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseKernel.cpp @@ -28,25 +28,29 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute { -CLBitwiseKernel::CLBitwiseKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLBitwiseKernel::CLBitwiseKernel() : 
_input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op) +void CLBitwiseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - if(op != BitwiseOperation::NOT) + if (op != BitwiseOperation::NOT) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); @@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Output auto inizialitation if not yet initialized auto_init_if_empty(*(output->info()), *(input1->info())); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); // Configure kernel window const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0)); @@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Create kernel std::string kernel_name = ""; - switch(op) + switch (op) { case BitwiseOperation::AND: kernel_name = "bitwise_and"; @@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_2D_tensor_argument(idx, _input1, slice); - if(_input2 != nullptr) + if (_input2 != nullptr) { add_2D_tensor_argument(idx, _input2, slice); } add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h index c5a999643d..2c74955ae4 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.h +++ b/src/core/CL/kernels/CLBitwiseKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp index 72de854afb..f32c518e29 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes); @@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe 
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16; - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f); @@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f); @@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe } } // namespace -CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() - : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) +CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void 
CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); - auto padding_info = get_padding_info({ boxes, pred_boxes, deltas }); - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto padding_info = get_padding_info({boxes, pred_boxes, deltas}); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); @@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale())); build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1"); - if(is_quantized) + if (is_quantized) { build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type())); const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor const unsigned int num_elems_processed_per_iteration = 4; - Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const 
ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h index 08f350e86a..9a1bb49bb9 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h @@ -58,7 +58,10 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. * */ - void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -71,7 +74,11 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
* */ - void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -85,7 +92,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index a2a0bc4fb4..ec58bf9e7a 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + 
input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); // There cannot be more groups than channels ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -70,11 +75,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen auto_init_if_empty(*output, *input->clone()); const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - if(is_nhwc) + if (is_nhwc) { - unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); - Window win_collapsed = win.collapse(win, Window::DimZ); + unsigned int num_elems_processed_per_iteration_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); + Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } else @@ -83,22 +89,25 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen constexpr 
unsigned int num_elems_processed_per_iteration_y = 2; // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window( + *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); const bool window_changed = update_window_and_padding(win, input_access, output_access); Window win_collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win_collapsed); } } } // namespace -CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr) +CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - const bool is_nhwc = data_layout == DataLayout::NHWC; - const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - unsigned int vec_size_x = 0; - unsigned int vec_size_x_leftovers = 0; - if(is_nhwc) + const DataLayout data_layout = input->info()->data_layout(); + const bool is_nhwc = data_layout == DataLayout::NHWC; + const unsigned int channels = + input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + unsigned int vec_size_x = 0; + unsigned int vec_size_x_leftovers = 0; + if (is_nhwc) { - vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + vec_size_x = 
adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; } else @@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(2)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h index 31c007f17e..43c939ebd8 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index f4d6316517..f27270733e 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,14 +39,10 @@ namespace arm_compute namespace { // Create supported comparisons map -const std::map<ComparisonOperation, std::string> supported_comparison_ops = -{ - { ComparisonOperation::Equal, "EQUAL" }, - { ComparisonOperation::NotEqual, "NOTEQUAL" }, - { ComparisonOperation::Greater, "GREATER" }, - { ComparisonOperation::GreaterEqual, "GREATEREQUAL" }, - { ComparisonOperation::Less, "LESS" }, - { ComparisonOperation::LessEqual, "LESSEQUAL" }, +const std::map<ComparisonOperation, std::string> supported_comparison_ops = { + {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"}, + {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"}, + {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"}, }; int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) @@ -53,7 +50,10 @@ int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) return 16 / input.element_size(); } -Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation) +Status validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN); @@ -64,7 +64,7 @@ Status validate_arguments(const 
ITensorInfo &input1, const ITensorInfo &input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured output - if(output.total_size() > 0) + if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { - const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1); // Auto initialize output if not initialized @@ -90,27 +90,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -CLComparisonKernel::CLComparisonKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation)); @@ -129,10 +136,11 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); build_opts.emplace("-DOP=" + operation_name); build_opts.emplace("-DOP_NAME=" + lower_string(operation_name)); - if(is_data_type_quantized(input1->info()->data_type())) + if 
(is_data_type_quantized(input1->info()->data_type())) { const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); @@ -160,12 +168,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons _config_id += lower_string(string_from_data_layout(input1->info()->data_layout())); } -Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparisonKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); return Status{}; } @@ -181,17 +193,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -212,16 +225,16 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } BorderSize CLComparisonKernel::border_size() const { const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info()); - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; + const unsigned int replicateSize = + _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize{0, border, 0, 0}; } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h index 0b94190183..174a6c9bf9 100644 --- a/src/core/CL/kernels/CLComparisonKernel.h +++ b/src/core/CL/kernels/CLComparisonKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLCOMPARISONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,11 @@ public: * @param[out] output Destination tensor. 
Data types supported: U8. * @param[in] operation Comparison operation to use. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel * * @param[in] input1 Source tensor. Data types supported: All. @@ -74,10 +79,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index 76af5d564a..f8ecc4c098 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, const 
PadStrideInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); } @@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co return Status{}; } -void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -119,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; const int out_step_y = _info.stride().second; - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -137,8 +140,7 @@ void 
CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); break; } case DataLayout::NHWC: @@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); break; } default: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index e0d1322341..762989a836 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -62,7 +62,10 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp index 0fc0ff8168..b33e0a8b6f 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -38,7 +39,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info); @@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first); ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32); - if(!is_qasymm) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32); + if (!is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info); } 
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * + weights_info->dimension(idx_b)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b)); - if(bias != nullptr) + if (bias != nullptr) { - if(is_qasymm) + if (is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), + stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const 
PadStrideInfo &deconv_info) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = + deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info())); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_layout(data_layout) + .set_quantization_info(input->quantization_info())); Window win = calculate_max_window(*input); @@ -109,29 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input } } // namespace -CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() - : _add_bias(false), - _bias(nullptr) +CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void 
CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info); } -void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, - const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info) +void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? 
bias->info() : nullptr), + output->info(), input_info, weights_info, deconv_info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({input, bias, output}); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); + auto win_config = + validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); const DataLayout data_layout = input_info->data_layout(); @@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info)); @@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu unsigned int idx = 0; add_3D_tensor_argument(idx, _input, collapsed); add_3D_tensor_argument(idx, _output, collapsed); - if(_add_bias) + if (_add_bias) { add_1D_tensor_argument(idx, _bias, collapsed); } diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h index ce354fa86f..8f436b07e3 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h @@ -67,7 +67,12 @@ public: * @param[in] weights_info Deconvolution weights tensor info. 
Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + void configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Initialise the kernel's source and destination. * * @param[in] compile_context The compile context to be used. @@ -79,8 +84,13 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel. 
* @@ -93,7 +103,12 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index 5c1dc4fbf6..cdf19ab2e1 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + 
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); @@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + 
support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); @@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_in, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_in)); + } while (window.slide_window_slice_3D(slice_in)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 1f7f77b569..cef70c4dda 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index e34b6929e7..b95abe795f 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" @@ -45,12 +46,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); bool in_place = false; - if(output == nullptr || output == input) + if (output == nullptr || output == input) { in_place = true; output = input; @@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && + (export_to_cl_image(weights) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); @@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier)); // In place restrictions - if(in_place) + if (in_place) { - const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + const int weights_width_idx = + 
get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || + weights->tensor_shape()[weights_height_idx] != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it + ARM_COMPUTE_RETURN_ERROR_ON( + conv_info.pad_stride_info + .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it } - const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation }; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); + const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), + conv_info.dilation}; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); - if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) + if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0); } const bool is_quantized = is_data_type_quantized(input->data_type()); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, 
DataType::S32); } @@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); @@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); @@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - if(is_data_type_quantized(input->data_type())) + if (is_data_type_quantized(input->data_type())) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + const UniformQuantizationInfo oq_info = + (output->total_size() != 0) ? 
output->quantization_info().uniform() : iq_info; float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; @@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, + output_multipliers, output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const 
DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - if(output == nullptr) + if (output == nullptr) { // In-place output = input; } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info, + conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info); - auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *(input->info()), *(weights->info()), conv_info); + auto_init_if_empty(*(output->info()), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_quantization_info(output->info()->quantization_info())); _input = input; _output = output; @@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & CLBuildOptions build_opts; // Update the padding for the input/weights tensor if we can export to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { 
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info()); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info()); } @@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & const auto act_function = conv_info.act_info.activation(); const auto dst_data_type = _output->info()->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1)); - build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0)); + build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", + "-DN0_A=" + support::cpp11::to_string(n0)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0)); 
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION"); // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll - set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1), - static_cast<int>(_weights->info()->dimension(1)), - static_cast<int>(_weights->info()->dimension(2)) - }); + set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1), + static_cast<int>(_weights->info()->dimension(1)), + static_cast<int>(_weights->info()->dimension(2))}); - if(biases != nullptr) + if (biases != nullptr) { build_opts.add_option(std::string("-DHAS_BIAS")); - build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); + build_opts.add_option( + std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); } - if(_is_quantized) + if (_is_quantized) { kernel_name = "dwc_native_quantized_nhwc"; const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); @@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); - build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type())); - build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); + build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + + 
get_cl_type_from_data_type(_output_multipliers->info()->data_type())); + build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_shifts->info()->data_type())); + build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, + "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach int a_val{}; int b_val{}; - std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); + std::tie(b_val, a_val) = + get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val)); @@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & { kernel_name = "dwc_native_fp_nhwc"; build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); } Window win = calculate_max_window(*(output->info()), Steps(n0, m0)); @@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & _config_id += string_from_data_type(input->info()->data_type()); } -Status 
CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); return Status{}; } @@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm cl::Image2D input_cl_image; cl::Image2D weights_cl_image; - if(_export_input_to_cl_image || _export_weights_to_cl_image) + if (_export_input_to_cl_image || _export_weights_to_cl_image) { // Export cl_buffer to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { - const size_t image_w = _input->info()->dimension(0) / 4; - const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); + const size_t image_w = _input->info()->dimension(0) / 4; + const size_t image_h = + _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _input->info()->strides_in_bytes()[1]; - input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + input_cl_image = + 
create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, + _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { - const size_t image_w = _weights->info()->dimension(0) / 4; - const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); + const size_t image_w = _weights->info()->dimension(0) / 4; + const size_t image_h = + _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1]; - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch, - CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, + _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, _input); add_4d_tensor_nhwc_argument(idx, _output); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, _weights); - if(_is_quantized) + if (_is_quantized) { add_1D_tensor_argument(idx, _output_multipliers, slice); add_1D_tensor_argument(idx, _output_shifts, slice); } - if(_biases != nullptr) + if (_biases != nullptr) { add_1D_tensor_argument(idx, _biases, slice); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 8eee7b2500..d34a662966 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ 
b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/function_info/ConvolutionInfo.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { class ICLTensor; @@ -74,15 +74,28 @@ public: * * no padding * * no change of data layout after configure */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref 
CLDepthwiseConvolutionLayerNativeKernel * @@ -90,23 +103,29 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input {}; + const ICLTensor *_input{}; const ICLTensor *_weights{}; const ICLTensor *_biases{}; ICLTensor *_output{}; - unsigned int _depth_multiplier{ 0 }; + unsigned int _depth_multiplier{0}; const ICLTensor *_output_multipliers{}; const ICLTensor *_output_shifts{}; - bool _export_input_to_cl_image{ false }; - bool _export_weights_to_cl_image{ true }; - bool _is_quantized{ false }; + bool _export_input_to_cl_image{false}; + bool _export_weights_to_cl_image{true}; + bool _is_quantized{false}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp index 9b514ed705..3d8f875ef7 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,17 +38,20 @@ namespace 
arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -69,21 +76,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() - : _input(nullptr), _output(nullptr), _idx(nullptr) +CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr) { _type = 
CLKernelType::ELEMENTWISE; } -void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config); } -void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); - auto padding_info = get_padding_info({ input, output, idx }); + auto padding_info = get_padding_info({input, output, idx}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); _input = input; @@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -137,7 +154,6 @@ void 
CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _output, slice); add_1D_tensor_argument(idx, _idx, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h index e5583a4c22..fdd1bcc3d3 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -56,7 +56,8 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void + configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -65,7 +66,11 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel * * @param[in] input Source tensor info. 
Data types supported: F16/F32. @@ -75,7 +80,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp index 95f4b640bd..3729e6b77d 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return 
Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -76,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTRadixStageKernel::CLFFTRadixStageKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config) +void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); - auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr, config)); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments if not the first stage - if(!config.is_first_stage) + if (!config.is_first_stage) { const unsigned int Ni = config.Nx * config.radix; const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni); - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg<cl_uint>(idx++, config.Nx); _kernel.setArg<cl_uint>(idx++, Ni); _kernel.setArg<cl_float>(idx, exp_const); @@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get(), config) + .first); return Status{}; } std::set<unsigned int> CLFFTRadixStageKernel::supported_radix() { - return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 }; + return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; } void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) @@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h index 9bb310db83..de80bfced3 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.h +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + #include <set> namespace arm_compute @@ -69,7 +69,10 @@ public: * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input * @param[in] config FFT descriptor metadata. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp index 8a714d71bf..be6e16b074 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.cpp +++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) } } // namespace -CLFFTScaleKernel::CLFFTScaleKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config) +void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen // Create kernel CLBuildOptions build_opts; build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() + : input->info()->num_channels())); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option_if(config.conjugate, "-DCONJ"); std::string kernel_name = "fft_scale_conj"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 
0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg<cl_float>(idx, config.scale); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); @@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h index cc518be193..b995282e02 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.h +++ b/src/core/CL/kernels/CLFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H #define ARM_COMPUTE_CLFFTSCALEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -63,7 +63,10 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] config Kernel configuration */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index fcd99a4ed9..86bb502da3 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -31,14 +31,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { -CLFillBorderKernel::CLFillBorderKernel() - : ICLKernel(), _tensor(nullptr) +CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue ICLKernel::add_argument<T>(idx, static_cast<T>(value)); } -void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { _tensor = tensor; configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const 
CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1); - auto padding_info = get_padding_info({ tensor }); + auto padding_info = get_padding_info({tensor}); border_size.limit(tensor->padding()); // If there is no border: early exit - if(border_size.empty() || border_mode == BorderMode::UNDEFINED) + if (border_size.empty() || border_mode == BorderMode::UNDEFINED) { return; } @@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Create static kernel arguments - const unsigned int valid_width = tensor->valid_region().shape[0]; - const unsigned int valid_height = tensor->valid_region().shape[1]; - const cl_int2 valid_region_coords = - { - { - static_cast<cl_int>(tensor->valid_region().anchor[0]), - static_cast<cl_int>(tensor->valid_region().anchor[1]), - } - }; - const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; + const unsigned int valid_width = tensor->valid_region().shape[0]; + const unsigned int valid_height = tensor->valid_region().shape[1]; + const cl_int2 valid_region_coords = {{ + static_cast<cl_int>(tensor->valid_region().anchor[0]), + static_cast<cl_int>(tensor->valid_region().anchor[1]), + }}; + const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; // Set static kernel arguments unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters ICLKernel::add_argument<cl_uint>(idx, valid_width); ICLKernel::add_argument<cl_uint>(idx, valid_height); ICLKernel::add_argument<cl_int2>(idx, valid_region_coords); - if(BorderMode::CONSTANT == border_mode) + if (BorderMode::CONSTANT == border_mode) { - switch(dt) + switch (dt) { case DataType::U8: case 
DataType::QASYMM8: @@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } - const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); @@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl:: unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } @@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h index 7951f48171..5782143cf9 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.h +++ b/src/core/CL/kernels/CLFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -57,7 +58,11 @@ public: * @param[in] border_mode Border mode 
to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +70,10 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in] compile_context The compile context to be used. @@ -74,7 +82,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type. 
* diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp index 68fe324df6..7da0679ae4 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp @@ -30,20 +30,26 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + 
get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false) + : 
_input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false) { _type = CLKernelType::ELEMENTWISE; } -void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + 
FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); - auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma }); + auto padding_info = + get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma}); _input_weights = input_weights; _input_bias = input_bias; @@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c _epsilon = epsilon; _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights); - _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); + _run_in_place_bias = + (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? 
fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon, + fbn_type)); // Configure kernel window Window win = calculate_max_window(*input_weights->info()); @@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type())); - build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); + build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, + "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W"); @@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl:: // Add kernel arguments unsigned int idx = 0; add_3D_tensor_argument(idx, _input_weights, slice_3d); - if(_input_bias != nullptr) + if (_input_bias != nullptr) { add_1D_tensor_argument(idx, _input_bias, slice_1d); } add_1D_tensor_argument(idx, _bn_mean, slice_1d); add_1D_tensor_argument(idx, _bn_var, slice_1d); - if(!_run_in_place_weights) + if (!_run_in_place_weights) { add_3D_tensor_argument(idx, _fused_weights, slice_3d); } - if(!_run_in_place_bias) + if (!_run_in_place_bias) { add_1D_tensor_argument(idx, _fused_bias, slice_1d); } - if(_bn_beta != nullptr) + if (_bn_beta != nullptr) { add_1D_tensor_argument(idx, _bn_beta, slice_1d); } - if(_bn_gamma != nullptr) + if (_bn_gamma != nullptr) { add_1D_tensor_argument(idx, _bn_gamma, slice_1d); } diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h index 78b1e74cab..76ec7a759f 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h @@ -62,9 +62,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
*/ - void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Set the source, destination of the kernel * * @param[in] compile_context The compile context to be used. @@ -81,9 +88,17 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. 
Data layout supported: NCHW, NHWC @@ -101,10 +116,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp index 5495023b80..c11a18940a 100644 --- a/src/core/CL/kernels/CLGatherKernel.cpp +++ b/src/core/CL/kernels/CLGatherKernel.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -34,7 +36,8 @@ namespace arm_compute { namespace { -inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +inline Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); @@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); // Output auto initialization if not yet initialized - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); auto_init_if_empty((*output), output_shape, 1, input->data_type()); // Create window @@ -72,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace -CLGatherKernel::CLGatherKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { _type = CLKernelType::ELEMENTWISE; } @@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGatherKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); - auto padding_info = get_padding_info({ input, output, indices }); + auto padding_info = get_padding_info({input, output, indices}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window @@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + 
get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); return Status{}; } diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h index 8f472a4696..db4b49d2f5 100644 --- a/src/core/CL/kernels/CLGatherKernel.h +++ b/src/core/CL/kernels/CLGatherKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. 
Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * @@ -74,7 +79,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp index 088c454f3c..b9ff72b928 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { size_t feature_height = info.feat_height(); size_t feature_width = info.feat_width(); @@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc 
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc } } // namespace -CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() - : _anchors(nullptr), _all_anchors(nullptr) +CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); } -void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors); - auto padding_info = get_padding_info({ anchors, all_anchors }); + auto padding_info = get_padding_info({anchors, all_anchors}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info)); // Metadata @@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, 
anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors)); build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi())); - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); @@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex } // Create kernel - const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + const std::string kernel_name = + (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields). 
// This means we don't need to pad on the X dimension, as we know in advance how many fields @@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h index d26795ac7d..e08f281d6c 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h @@ -62,7 +62,10 @@ public: * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel * @@ -81,5 +84,5 @@ private: const ICLTensor *_anchors; ICLTensor *_all_anchors; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp index 7ed323c950..b13eb16556 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include 
"arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -39,17 +40,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; @@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of 
channels"); } return Status{}; } } // namespace -CLComputeMeanVariance::CLComputeMeanVariance() - : _input(nullptr), _output(nullptr) +CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision) +void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + bool use_mixed_precision) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? input : output; @@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); CLBuildOptions build_opts; - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + + (use_mixed_precision ? 
"float" : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const TensorShape out_shape(input_channel, 2u, input_batches); // Output auto initialization if not yet initialized - if(use_mixed_precision) + if (use_mixed_precision) { auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32); } @@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue) Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info) +void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? 
input : output; @@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision + ? "float" + : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); @@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(1)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type()); } @@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info)); return Status{}; @@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if 
(_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu add_4D_tensor_argument(idx, _input, collapsed_window); add_3D_tensor_argument(idx, _mean, collapsed_window); - if(!_run_in_place) + if (!_run_in_place) { add_4D_tensor_argument(idx, _output, collapsed_window); } diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h index 2f9014a651..9f436da7f6 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] info Kernel meta-data descriptor */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. 
* @@ -69,7 +73,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -106,7 +111,8 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); + void + configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 542d380e4a..9ed9d7c5b0 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -31,10 +31,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -43,7 +43,8 @@ namespace { constexpr int max_input_tensor_dim = 3; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -53,14 +54,15 @@ Status validate_arguments(const 
ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure( + const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon); } -void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); - auto padding_info = 
get_padding_info({ input, sum, output }); + auto padding_info = get_padding_info({input, sum, output}); _input = input; _sum = sum; @@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _actual_axis = wrap_around(axis, max_input_tensor_dim); _epsilon = epsilon; - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; // Set build options CLBuildOptions build_opts; @@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context // Create kernel std::string kernel_name; unsigned int idx = 0; - switch(_actual_axis) + switch (_actual_axis) { case 0: kernel_name = "l2_normalize_x"; @@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set epsilon argument - if(input->info()->data_type() == DataType::F32) + if (input->info()->data_type() == DataType::F32) { _kernel.setArg<cl_float>(idx, _epsilon); } @@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status CLL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); return Status{}; @@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const 
Window &window, cl::CommandQueue &queue Window window_sum(window); - switch(_actual_axis) + switch (_actual_axis) { case 0: { @@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 1: @@ -189,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 2: @@ -205,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_3D_tensor_argument(idx, _sum, sum_slice); add_3D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); + } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); } break; default: diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h index edc0585217..5c9ab94ce5 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -70,7 +71,12 @@ public: * @param[in] axis Axis along which to reduce. Negative values wrap around. 
Maximum supported actual reduction axis : 2 * @param[in] epsilon Lower bound value for the normalization. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel. * @@ -84,7 +90,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp index dc9d68626d..e560f1de4a 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,26 +43,31 @@ using namespace misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() - : _input(nullptr), _output(nullptr), _indices(nullptr) +CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr) { _type 
= CLKernelType::POOL; } -void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info())); - auto padding_info = get_padding_info({ input, indices, output }); + auto padding_info = get_padding_info({input, indices, output}); _input = input; _output = output; @@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices)); @@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice); add_3D_tensor_argument(idx, _indices, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index 45481d0507..eb18a46784 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -59,7 +59,11 @@ 
public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -72,7 +76,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp index ac33468ad8..8632bdf623 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); // Checks performed when output is configured - 
if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); _run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? 
output->info() : nullptr, epsilon)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()); } @@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_ _input = input; _output = output; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; @@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h index a1ba2b905e..e02a3c58a3 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h @@ -66,7 +66,10 @@ public: * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output = nullptr, + float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. 
In case of @p output tensor info = nullptr, diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index c6c4229c00..b636c485e7 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -71,9 +73,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; Window win; const DataLayout data_layout = input->data_layout(); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / 
input->element_size(), input->dimension(0)); const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); const bool is_norm_across_width = norm_idx == 0; @@ -87,15 +90,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND because of the extra output padding - const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; - const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; - const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; + const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); win = calculate_max_window(*input, Steps(vec_size_x)); // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding // Reads can occur within the valid region of the input - if(is_norm_across_width) + if (is_norm_across_width) { AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0); window_changed = window_changed || update_window_and_padding(win, input_access); @@ -112,13 +116,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen else { unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - if(norm_info.is_cross_map()) + if (norm_info.is_cross_map()) { vec_size_x = 1; } win = calculate_max_window(*input, Steps(vec_size_x)); } - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); @@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; - if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC) + const DataLayout data_layout = input->info()->data_layout(); + unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC) { vec_size_x = 1; vec_size_x_leftovers = 0; } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { const unsigned int 
norm_idx = get_normalization_dimension_index(data_layout, norm_info); _is_norm_across_width = norm_idx == 0; @@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND the extra output padding - const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0; - const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; - _border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; + _border_size = BorderSize(0, border_width_right, 0, border_width_left); } const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); @@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2))); build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), + "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); + 
build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, + "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); // Create kernel std::string kernel_name; - if(norm_info.is_in_map()) + if (norm_info.is_in_map()) { kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout)); } @@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info) +Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); return Status{}; } @@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h index 739a2ae9f1..5517ba6904 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h @@ -63,7 +63,10 @@ public: * Data layouts supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -77,7 +80,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index 6b0400d50e..59352a8fb7 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -31,32 +31,35 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - 
#include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors"); - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0)); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -77,7 +80,8 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, bool window_changed = update_window_and_padding(win, input_access, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); @@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ const DataLayout data_layout = input->info()->data_layout(); // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(), - input->info()->dimension(0)) : - (16 / input->info()->element_size()); + const unsigned int num_elems_processed_per_iteration = + (data_layout == DataLayout::NHWC) + ? 
adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)) + : (16 / input->info()->element_size()); const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const DataType dt = input->info()->data_type(); @@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ CLBuildOptions build_opts; build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); + build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx)))); std::string kernel_name = "normalize_planar_yuv_layer_"; - if(is_data_type_quantized(dt)) + if (is_data_type_quantized(dt)) { const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset))); @@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _config_id += support::cpp11::to_string(input->info()->dimension(2)); } -Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const 
ITensorInfo *std) +Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); } return Status{}; } @@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index 6db4433e78..341b404e3d 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -67,7 +67,11 @@ public: * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. * Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. 
@@ -79,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp index 53f313c0d3..0ac285038e 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.cpp +++ b/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -35,25 +36,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions())); - if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) { ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); - for(size_t i = 0; i < padding.size(); ++i) + for (size_t i = 0; i < padding.size(); ++i) { ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > 
(input->dimension(i) - is_reflect)); ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); } } - if(output->total_size() > 0) + if (output->total_size() > 0) { TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); @@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLPadLayerKernel::CLPadLayerKernel() - : _input(nullptr), _output(nullptr), _4d_enabled(false) +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false) { _type = CLKernelType::ELEMENTWISE; } -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure( + const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode)); - auto padding_info = get_padding_info({ input, output 
}); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); // Set build options - const DataType &data_type = input->info()->data_type(); - const unsigned int input_width = input->info()->dimension(0); - const unsigned int input_height = input->info()->dimension(1); - const unsigned int input_depth = input->info()->dimension(2); - const unsigned int pad_x_before = padding.at(0).first; - const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; - const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; - const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width); - const unsigned int pad_right_start = input_width + pad_x_before; - const unsigned int pad_x_before_remainder = pad_x_before % vec_size; - const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? 
padding.at(2).first : 0; + const unsigned int vec_size = adjust_vec_size( + std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), + input_width); + const unsigned int pad_right_start = input_width + pad_x_before; + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int vec_size_leftover_write = + vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write)); - if(padding.size() > 1) + if (padding.size() > 1) { build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - if(padding.size() > 2) + if (padding.size() > 2) { build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); @@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const } std::string kernel_name = "pad_layer_"; - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { kernel_name += "constant"; - const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); + const unsigned int vec_size_leftover_read = + vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); 
build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read)); - if(pad_x_before >= vec_size) + if (pad_x_before >= vec_size) { build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size)); - build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size)); + build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + + support::cpp11::to_string(pad_right_start / vec_size)); } - if(_4d_enabled) + if (_4d_enabled) { build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3))); @@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const const unsigned int pad_x_after_remainder = pad_right_start % vec_size; const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; - const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder)); - build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); - build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); build_opts.add_option("-DAFTER_PAD_FACT_X=" + 
support::cpp11::to_string(after_pad_fact_x)); - build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); break; } @@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); return Status{}; @@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); - if(_4d_enabled) + if (_4d_enabled) { add_argument<unsigned int>(idx, batch++); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h index 90af337f94..dca121b6a1 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.h +++ b/src/core/CL/kernels/CLPadLayerKernel.h @@ -56,7 +56,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). 
*/ - void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + void configure(const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Set the input and output tensor. * * @param[in] compile_context The compile context to be used. @@ -68,8 +72,12 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), - PaddingMode mode = PaddingMode::CONSTANT); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel * * @param[in] input Source tensor info. Data types supported: All. @@ -80,7 +88,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp index bf1b874dd0..7dcdf1de6f 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size 
<= 0, "Must be greater than 0"); } @@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); } @@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + const PriorBoxLayerInfo &info, + int num_priors) { ARM_COMPUTE_UNUSED(input2); // Output tensor auto initialization if not yet initialized @@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input auto_init_if_empty(*output, output_shape, 1, input1->data_type()); const unsigned int num_elems_processed_per_iteration 
= 4 * num_priors; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios); } -void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, - cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c int img_width = info.img_size().x; int img_height = 
info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = input2->info()->dimension(width_idx); img_height = input2->info()->dimension(height_idx); @@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c float step_x = info.steps()[0]; float step_y = info.steps()[0]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast<float>(img_width) / layer_width; step_y = static_cast<float>(img_height) / layer_height; @@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset())); build_opts.add_option_if(info.clip(), "-DIN_PLACE"); - if(info.variances().size() > 1) + if (info.variances().size() > 1) { - for(unsigned int i = 0; i < info.variances().size(); ++i) + for (unsigned int i = 0; i < info.variances().size(); ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(i))); } } else { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(0))); } } @@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c ICLKernel::configure_internal(win_config.second); } -Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo 
*input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), + output->clone().get(), info, num_priors) + .first); return Status{}; } @@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data()); - queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data()); - if(!_info.max_sizes().empty()) + queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), + _info.aspect_ratios().data()); + if (!_info.max_sizes().empty()) { queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data()); } diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h index 6c369a7a4e..a50e0c5ff5 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h @@ -57,7 +57,13 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios); + void configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor 
*output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -69,8 +75,14 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, - cl::Buffer *aspect_ratios); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. @@ -80,14 +92,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input1; - const ICLTensor *_input2; + const ICLTensor *_input1; + const ICLTensor *_input2; ICLTensor *_output; PriorBoxLayerInfo _info; int _num_priors; diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp index bd573e54c8..731fcb8e04 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp @@ -22,10 +22,12 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -49,14 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size(); /* If width is less then step, then make step same as width to avoid global size being step instead of actual width. */ /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */ - const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration; + const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) + ? 
input->dimension(0) + : temp_num_elems_processed_per_iteration; // This kernel doesn't need padding Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output); @@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); - auto padding_info = get_padding_info({ input, weight, bias, output }); + auto padding_info = get_padding_info({input, weight, bias, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info())); @@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ int32_t output_multiplier{}; int32_t output_shift{}; const UniformQuantizationInfo quan_info = 
_weight->info()->quantization_info().uniform(); - const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); + const Status status = + quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); output_shift *= -1; // Set build options @@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); - build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMIN_BOUND=" + + support::cpp11::to_string(std::get<0>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMAX_BOUND=" + + support::cpp11::to_string(std::get<1>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); // Create kernel _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options()); @@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { 
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias); } -Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); @@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h index 31085c37ba..ba912e1d2d 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -63,7 +63,11 @@ public: * @param[in] weight Weight tensor. Data types supported: Same as @p input. * @param[in] bias Bias tensor. Data types supported: S32. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias); /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16. 
@@ -73,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp index 69a6fa5fa0..c97910ef79 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,24 +43,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - 
if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); @@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); output->info()->set_data_layout(input->info()->data_layout()); - auto padding_info = get_padding_info({ input, rois, output }); 
+ auto padding_info = get_padding_info({input, rois, output}); _input = input; _output = output; @@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())); - build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH)))); - build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); - build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); + build_opts.add_option("-DMAX_DIM_X=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::WIDTH)))); + build_opts.add_option("-DMAX_DIM_Y=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); + build_opts.add_option("-DMAX_DIM_Z=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())); build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale())); build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); - build_opts.add_option_if(pool_info.sampling_ratio() > 0, 
"-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); + build_opts.add_option_if(pool_info.sampling_ratio() > 0, + "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); - if(is_qasymm) + if (is_qasymm) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform(); @@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h index 5284a5913f..2e84e5d303 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.h +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h @@ -61,7 +61,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -77,7 +78,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. 
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -93,7 +98,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp index f6933c6cfd..1b2c414a49 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const 
ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); @@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor return Status{}; } -void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { - ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); + 
ARM_COMPUTE_ERROR_THROW_ON( + CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); - auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); + auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); // Set instance variables _input = input; @@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())); - if(is_qasymm) + if (is_qasymm) { // Determine quantization info scale, offset UniformQuantizationInfo uqinfo = UniformQuantizationInfo(); - uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); + uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), + _output->info()->quantization_info().uniform()); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale)); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h index 7b7b457632..80bfb63092 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h +++ 
b/src/core/CL/kernels/CLROIPoolingLayerKernel.h @@ -59,7 +59,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -74,7 +75,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -92,7 +97,10 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ICLTensor *_input; diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp index a06c2eed75..622f6210b9 100644 --- a/src/core/CL/kernels/CLRangeKernel.cpp +++ b/src/core/CL/kernels/CLRangeKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); @@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is 
outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -CLRangeKernel::CLRangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRangeKernel::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step)); // Configure kernel window - unsigned int 
num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - auto padding_info = get_padding_info({ output }); + auto padding_info = get_padding_info({output}); _start = start; _end = end; @@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DSTART=" + support::cpp11::to_string(start)); build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step)); - if(is_data_type_quantized_asymmetric(output->info()->data_type())) + if (is_data_type_quantized_asymmetric(output->info()->data_type())) { const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset)); diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h index 1b94a099ed..65251a11e5 100644 --- a/src/core/CL/kernels/CLRangeKernel.h +++ b/src/core/CL/kernels/CLRangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLRANGEKERNEL_H #include "arm_compute/core/Types.h" + #include 
"src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index e5cfb997ca..70875a2d40 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -28,15 +28,15 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); } else { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(axis == 0); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op 
== ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, + "Not supported reduction operation for QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8) - && (input->data_type() != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer"); + ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && + (input->data_type() != DataType::QASYMM8) && + (input->data_type() != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), + "Not supported reduction operation, use CLArgMinMaxLayer"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void 
CLReductionOperationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _reduction_axis = axis; _op = op; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); // Set build options CLBuildOptions build_opts; DataType data_type = input->info()->data_type(); std::string data_type_promoted{}; - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { data_type_promoted = "int"; } @@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD"); build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN"); build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX"); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); - - switch(op) + build_opts.add_option_if(is_data_type_quantized(data_type), + "-DOFFSET=" 
+ + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); + build_opts.add_option_if( + is_data_type_quantized(data_type), + "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); + + switch (op) { case ReductionOperation::SUM_SQUARE: build_opts.add_option(("-DOPERATION=square_sum")); @@ -159,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte std::string kernel_axis_name; const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(axis) + switch (axis) { case 0: { @@ -187,13 +205,17 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(vec_size)); - win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); + win.set(Window::DimX, + Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -205,18 +227,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { // We use parallel reduction only in non quantized types - if(is_serial_op) + if (is_serial_op) 
{ // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + Window window_in{window}; + window_in.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - Window out_window{ window }; + Window out_window{window}; out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); Window in_slice = window_in.first_slice_window_1D(); @@ -228,8 +251,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_1D_tensor_argument(idx, _input, in_slice); add_1D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } else { @@ -251,8 +273,9 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -262,15 +285,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - 
window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -280,14 +303,13 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -298,8 +320,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h index b456378746..2f94b2add3 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.h +++ b/src/core/CL/kernels/CLReductionOperationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -67,7 +68,11 @@ public: * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel. * @@ -79,7 +84,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp index 3c74e80d33..9fd21943e8 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.cpp +++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the 
input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLReorgLayerKernel::CLReorgLayerKernel() - : _input(nullptr), _output(nullptr) +CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride)); - auto padding_info = get_padding_info({ input, output }); + 
auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); - const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + std::string kernel_name = + std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); + const size_t idx_channel = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); // Create kernel CLBuildOptions build_opts; @@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons // Configure window // auto inizialize the output tensor if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); Window win = calculate_max_window(*output->info(), Steps()); @@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride) +Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + int32_t stride) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride)); @@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while 
(window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h index 455a6170c6..f335071e9f 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.h +++ b/src/core/CL/kernels/CLReorgLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREORGLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 0d70ff4f3c..79a0f03b1e 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -60,8 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLReverseKernel::CLReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr) +CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -71,10 +71,13 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, 
ICLTensor *output, const ICLTensor *axis) +void CLReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - auto padding_info = get_padding_info({ input, output, axis }); + auto padding_info = get_padding_info({input, output, axis}); _input = input; _output = output; @@ -138,7 +141,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_1D_tensor_argument(idx, _axis, axis_slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h index 4a21e4f802..fbd99dc883 100644 --- a/src/core/CL/kernels/CLReverseKernel.h +++ b/src/core/CL/kernels/CLReverseKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. 
Data type supported: U32 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp index c0e014e8b8..703c64d8d3 100644 --- a/src/core/CL/kernels/CLSelectKernel.cpp +++ b/src/core/CL/kernels/CLSelectKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen } } // namespace -CLSelectKernel::CLSelectKernel() - : _c(nullptr), _x(nullptr), _y(nullptr), 
_output(nullptr), _has_same_rank(false) +CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) { _type = CLKernelType::ELEMENTWISE; } -void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelectKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info())); @@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC _output = output; _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions()); - auto padding_info = get_padding_info({ c, x, y, output }); + auto padding_info = get_padding_info({c, x, y, output}); const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0)); const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x; @@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC // Create kernel std::string kernel_name = "select"; - if(_has_same_rank) + if (_has_same_rank) { kernel_name += "_same_rank"; } else { const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2; - if(is_input_rank_greater_than_two) + if (is_input_rank_greater_than_two) { const size_t width = x->info()->tensor_shape().x(); const size_t height = x->info()->tensor_shape().y(); @@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status 
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output)); return Status{}; @@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - if(!_has_same_rank) + if (!_has_same_rank) { Window vector_slice = window.first_slice_window_1D(); vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu do { unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor(); - if(_has_same_rank) + if (_has_same_rank) { add_3D_tensor_argument(idx, _c, slice); } @@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h index b8c10cd7cf..c4256fd743 100644 --- a/src/core/CL/kernels/CLSelectKernel.h +++ b/src/core/CL/kernels/CLSelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,11 @@ public: * @param[out] y Second input tensor. Data types supported: Same as @p x * @param[in] output Output tensor. Data types supported: Same as @p x. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel * * @param[in] c Condition input tensor. Data types supported: U8. diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp index 3632ae2b03..f4c0839ad2 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,19 +39,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); @@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor 
*input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); - auto padding_info = get_padding_info({ input, block_shape, paddings, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + auto padding_info = get_padding_info({input, block_shape, paddings, output}); _input = input; _block_shape = block_shape; @@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); build_opts.add_option("-DWIDTH_IN=" + 
support::cpp11::to_string(input->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch))); - _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); @@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x())); build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y())); build_opts.add_option("-DPAD_RIGHT_Y=" + 
support::cpp11::to_string(padding_right.y())); - _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git 
a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h index 4817cfeef2..f9dce9db47 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -73,7 +78,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] compile_context The compile context to be used. @@ -84,8 +94,13 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +110,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. 
@@ -107,7 +125,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp index c5ffdb588b..25662b5c62 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out 
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); @@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - 
while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h index bb1ac5f9a6..d0932919e0 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp index 075c93ab60..23e26716e7 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.cpp +++ b/src/core/CL/kernels/CLStackLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, 
ITensorInfo *output) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -73,18 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi } } // namespace -CLStackLayerKernel::CLStackLayerKernel() - : _input(nullptr), _output(nullptr) +CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output); } -void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -112,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons _kernel.setArg<cl_uint>(idx, idx_input); } -Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status CLStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int 
idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h index 2865127a90..d3c17f529c 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.h +++ b/src/core/CL/kernels/CLStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); /** Initialise the kernel's inputs and output * * @note Supported input tensor rank: up to 4 @@ -74,7 +76,12 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. 
* */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -88,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index 9acbafdb19..a8f6112820 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -22,11 +22,13 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLStridedSliceKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/bit_ops.h" @@ -37,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = 
arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel() _type = CLKernelType::ELEMENTWISE; } -void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + auto padding_info = get_padding_info({input, output}); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); const TensorShape &input_shape = input->tensor_shape(); Coordinates starts_abs; Coordinates ends_abs; Coordinates final_strides; - std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(starts_abs, ends_abs, final_strides) = + 
arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); Window win = calculate_max_window(*output, Steps()); @@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0); // Update window if needed - if(multi_access_x) + if (multi_access_x) { Window &updated_window = win; updated_window.set(Window::DimX, - Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); + Window::Dimension(updated_window.x().start(), + ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); // Create build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i])); - build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + 
support::cpp11::to_string(final_strides[i])); + build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(starts_abs[i])); + build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(final_strides[i])); build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i)); } - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option_if_else(input_shape.num_dimensions() > 2, - "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), - "-DSRC_DEPTH=1"); + "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1"); build_opts.add_option_if_else(output->num_dimensions() > 2, "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()), "-DDST_DEPTH=1"); @@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co _config_id = "strided_slice"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->data_type())); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(i)); @@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status 
CLStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); return Status{}; } @@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_4D(); @@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_4D(slice)); + } while (window_collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h index 4c201504f5..1cf5bcacec 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.h +++ b/src/core/CL/kernels/CLStridedSliceKernel.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" #include <cstdint> @@ -53,9 +54,15 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -71,9 +78,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index 3e7015cfd2..fa996c4008 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLTileKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLTileKernel::CLTileKernel() - : _input(nullptr), _output(nullptr) +CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTileKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT // Configure window without padding Window win = calculate_max_window(*output->info()); - if(multi_access_x) + if (multi_access_x) { // If multi-access is enabled, no thread should cross the tile boundaries. This means we need // as many threads as those to cover a single tile times multiples[0]. 
Note that if threads // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and // we don't need to pad the output const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0]; - win.set(Window::DimX, - Window::Dimension(win.x().start(), size_win_x, vec_size_x)); + win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x)); } ICLKernel::configure_internal(win); @@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT _config_id = "tile"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); - for(unsigned int i = 0; i < multiples.size(); ++i) + for (unsigned int i = 0; i < multiples.size(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(i)); @@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h index 41752ca90b..c3486aecef 100644 --- a/src/core/CL/kernels/CLTileKernel.h +++ b/src/core/CL/kernels/CLTileKernel.h @@ -64,7 +64,10 @@ public: * @param[out] output Destination tensor. Same as @p input * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel * * @param[in] input Source tensor info. Data type supported: All. |